diff options
author | Tom Rondeau <trondeau@vt.edu> | 2012-06-24 16:06:38 -0400 |
---|---|---|
committer | Tom Rondeau <trondeau@vt.edu> | 2012-06-24 16:06:38 -0400 |
commit | d97764ce6040aff8023b43323b4bc048a907a1fd (patch) | |
tree | dda4ba74128454fd271ffe7dcbe673f2ae0b4c08 /volk/lib | |
parent | b142f64573b8b7b15a94fc9c64e2f26264a5b144 (diff) | |
parent | bf8700a226091c5ce0130a3819a4c12b9d9981a6 (diff) |
Merge branch 'master' into my_qtsink
Conflicts:
gr-qtgui/include/qtgui_sink_c.h
gr-qtgui/include/qtgui_sink_f.h
gr-qtgui/include/qtgui_util.h
gr-qtgui/lib/ConstellationDisplayPlot.cc
gr-qtgui/lib/FrequencyDisplayPlot.cc
gr-qtgui/lib/SpectrumGUIClass.cc
gr-qtgui/lib/TimeDomainDisplayPlot.cc
gr-qtgui/lib/WaterfallDisplayPlot.cc
gr-qtgui/lib/qtgui_sink_c.cc
gr-qtgui/lib/qtgui_sink_f.cc
gr-qtgui/lib/spectrumdisplayform.cc
Diffstat (limited to 'volk/lib')
-rw-r--r-- | volk/lib/CMakeLists.txt | 364 | ||||
-rw-r--r-- | volk/lib/gcc_x86_cpuid.h | 16 | ||||
-rw-r--r-- | volk/lib/qa_16s_add_quad_aligned16.cc | 10 | ||||
-rw-r--r-- | volk/lib/qa_16s_branch_4_state_8_aligned16.cc | 38 | ||||
-rw-r--r-- | volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc | 18 | ||||
-rw-r--r-- | volk/lib/qa_16s_quad_max_star_aligned16.cc | 6 | ||||
-rw-r--r-- | volk/lib/qa_32f_fm_detect_aligned16.cc | 8 | ||||
-rw-r--r-- | volk/lib/qa_32f_index_max_aligned16.cc | 36 | ||||
-rw-r--r-- | volk/lib/qa_32fc_index_max_aligned16.cc | 34 | ||||
-rw-r--r-- | volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc | 8 | ||||
-rw-r--r-- | volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc | 28 | ||||
-rw-r--r-- | volk/lib/qa_32u_popcnt_aligned16.cc | 8 | ||||
-rw-r--r-- | volk/lib/qa_64u_popcnt_aligned16.cc | 8 | ||||
-rw-r--r-- | volk/lib/qa_utils.cc | 66 | ||||
-rw-r--r-- | volk/lib/qa_utils.h | 8 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 11 | ||||
-rw-r--r-- | volk/lib/volk_prefs.c | 2 | ||||
-rw-r--r-- | volk/lib/volk_rank_archs.c | 4 |
18 files changed, 335 insertions, 338 deletions
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt index b491f94bbb..8288786c98 100644 --- a/volk/lib/CMakeLists.txt +++ b/volk/lib/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright 2011 Free Software Foundation, Inc. +# Copyright 2011-2012 Free Software Foundation, Inc. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -16,6 +16,31 @@ # ######################################################################## +# header file detection +######################################################################## +include(CheckIncludeFile) +CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) +if(HAVE_CPUID_H) + add_definitions(-DHAVE_CPUID_H) +endif() + +CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H) +if(HAVE_INTRIN_H) + add_definitions(-DHAVE_INTRIN_H) +endif() + +CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H) +if(HAVE_FENV_H) + add_definitions(-DHAVE_FENV_H) +endif() + +CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H) +if(HAVE_DLFCN_H) + add_definitions(-DHAVE_DLFCN_H) + list(APPEND volk_libraries ${CMAKE_DL_LIBS}) +endif() + +######################################################################## # Setup the compiler name ######################################################################## set(COMPILER_NAME ${CMAKE_C_COMPILER_ID}) @@ -23,235 +48,183 @@ if(MSVC) #its not set otherwise set(COMPILER_NAME MSVC) endif() +message(STATUS "Compiler name: ${COMPILER_NAME}") + if(NOT DEFINED COMPILER_NAME) message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.") endif() ######################################################################## -# Parse the arches xml file: -# Test each arch to see if the compiler supports the flag. -# If the test passes append the arch to the available list. +# detect x86 flavor of CPU ######################################################################## -#extract the compiler lines from the xml file using abusive python - - - -execute_process( - COMMAND ${PYTHON_EXECUTABLE} -c - "from xml.dom import minidom; print ';'.join(map(lambda b: ','.join([','.join([b.attributes['name'].value,item.attributes['name'].value,item.firstChild.data]) for item in b.getElementsByTagName('remap')]), minidom.parse('${CMAKE_SOURCE_DIR}/gen/compilers.xml').getElementsByTagName('compiler')))" - - OUTPUT_VARIABLE compiler_lines OUTPUT_STRIP_TRAILING_WHITESPACE -) - -foreach(thing ${compiler_lines}) - string(REGEX REPLACE "," ";" thing_list ${thing}) - list(FIND thing_list ${COMPILER_NAME} check_val) - if(NOT ("${check_val}" STREQUAL "-1")) - string(REGEX REPLACE "${COMPILER_NAME}," ";" filter_string ${thing}) - endif() -endforeach() - +if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64)$") + message(STATUS "x86* CPU detected") + set(CPU_IS_x86 TRUE) +endif() -#extract compiler prefixes from the xml file using abusive python +######################################################################## +# determine passing architectures based on compile flag tests +######################################################################## execute_process( - COMMAND ${PYTHON_EXECUTABLE} -c - "from xml.dom import minidom; print ';'.join(map(lambda b: ','.join([','.join([b.attributes['name'].value,item.firstChild.data]) for item in b.getElementsByTagName('prefix')]), minidom.parse('${CMAKE_SOURCE_DIR}/gen/compilers.xml').getElementsByTagName('compiler')))" - - OUTPUT_VARIABLE compiler_prefixes OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "arch_flags" --compiler "${COMPILER_NAME}" + OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE ) -foreach(thing ${compiler_prefixes}) - string(REGEX REPLACE "," ";" thing_list ${thing}) - list(FIND thing_list ${COMPILER_NAME} check_val) - if(NOT ("${check_val}" STREQUAL "-1")) - list(GET thing_list "1" prefix) +macro(check_arch arch_name) + set(flags ${ARGN}) + set(have_${arch_name} TRUE) + foreach(flag ${flags}) + include(CheckCXXCompilerFlag) + set(have_flag have${flag}) + execute_process( #make the have_flag have nice alphanum chars (just for looks/not necessary) + COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))" + OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE + ) + CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag}) + if (NOT ${have_flag}) + set(have_${arch_name} FALSE) + endif() + endforeach(flag) + if (have_${arch_name}) + list(APPEND available_archs ${arch_name}) endif() -endforeach() - - - - -#extract the arch lines from the xml file using abusive python -execute_process( - COMMAND ${PYTHON_EXECUTABLE} -c - "from xml.dom import minidom; print ';'.join(map(lambda a: '%s %s %s %s'%(a.attributes['name'].value,a.getElementsByTagName('flag')[0].firstChild.data,a.getElementsByTagName('overrule')[0].firstChild.data,a.getElementsByTagName('overrule_val')[0].firstChild.data) if (len(a.getElementsByTagName('overrule'))) else '%s %s %s %s'%(a.attributes['name'].value,a.getElementsByTagName('flag')[0].firstChild.data,'no_overrule', 'no_overrule_val'), minidom.parse('${CMAKE_SOURCE_DIR}/gen/archs.xml').getElementsByTagName('arch')))" - - OUTPUT_VARIABLE arch_lines OUTPUT_STRIP_TRAILING_WHITESPACE -) +endmacro(check_arch) +foreach(line ${arch_flag_lines}) + string(REGEX REPLACE "," ";" arch_flags ${line}) + check_arch(${arch_flags}) +endforeach(line) +macro(OVERRULE_ARCH arch reason) + message(STATUS "${reason}, Overruled arch ${arch}") + list(REMOVE_ITEM available_archs ${arch}) +endmacro(OVERRULE_ARCH) - -#set the various overrule values (see archs.xml) -#a lot of this is translating between automake and cmake -if(NOT "${CROSSCOMPILE_MULTILIB}" STREQUAL "true") - set(MD_SUBCPU ${CMAKE_SYSTEM_PROCESSOR}) - #detect 32 or 64 bit compiler - if(MD_SUBCPU MATCHES "^(i.86|x86|x86_64|amd64)$") - include(CheckTypeSize) - check_type_size("void*" SIZEOF_VOID_P BUILTIN_TYPES_ONLY) - if (${SIZEOF_VOID_P} EQUAL 8) - set(MD_SUBCPU x86_64) - else() - set(MD_SUBCPU x86) - endif() +######################################################################## +# eliminate AVX on GCC < 4.4 +# even though it accepts -mavx, as won't assemble xgetbv, which we need +######################################################################## +if(CPU_IS_x86 AND COMPILER_NAME MATCHES "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion + OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + if(GCC_VERSION VERSION_LESS "4.4") + OVERRULE_ARCH(avx "GCC missing xgetbv") endif() endif() -if(NOT "${ORC_FOUND}" STREQUAL "TRUE") - set(LV_HAVE_ORC "no") -endif() - +######################################################################## +# implement overruling in the ORC case, +# since ORC always passes flag detection +######################################################################## +if(NOT ORC_FOUND) + OVERRULE_ARCH(orc "ORC support not found") +endif() - - - -macro(compiler_filter name flag) - set(filtered_flag ${flag}) - foreach(thing ${filter_string}) - string(REGEX REPLACE "," ";" flagmap ${thing}) - list(GET flagmap "0" key) - list(GET flagmap "1" val) - string(REGEX MATCH "^${key}$" found ${flag}) - if("${found}" STREQUAL "${key}") - string(REGEX REPLACE "^${key}$" "${val}" filtered_flag ${flag}) - endif() - endforeach() - set(${name}_flag "${prefix}${filtered_flag}") -endmacro() - - - - - - - -macro(handle_arch name flag overrule overrule_val) - - #handle overrule - if("${${overrule}}" STREQUAL "${overrule_val}") - set(have_${name} FALSE) - message(STATUS "${name} overruled") - #handle special case for none flag - elseif(${flag} STREQUAL "none") - set(have_${name} TRUE) - #otherwise test the flag(s) against the compiler - else() - include(CheckCXXCompilerFlag) - string(REGEX REPLACE "," ";" flag_list ${flag}) - set(have_${name} 1) - foreach(thing ${flag_list}) - compiler_filter(${name} ${thing}) - CHECK_CXX_COMPILER_FLAG(${${name}_flag} have_${thing}) - if(NOT (${have_${name}} AND ("${have_${thing}}" STREQUAL "1"))) - set(have_${name} 0) - endif() - endforeach() +######################################################################## +# implement overruling in the non-multilib case +# this makes things work when both -m32 and -m64 pass +######################################################################## +if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) + include(CheckTypeSize) + check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY) + if (${SIZEOF_CPU} EQUAL 64) + OVERRULE_ARCH(32 "CPU width is 64 bits") endif() - - if(have_${name}) - list(APPEND available_arches ${name}) + if (${SIZEOF_CPU} EQUAL 32) + OVERRULE_ARCH(64 "CPU width is 32 bits") endif() +endif() -endmacro(handle_arch) - -#create a list of available arches -foreach(arch_line ${arch_lines}) - string(REPLACE " " ";" args "${arch_line}") - handle_arch(${args}) -endforeach(arch_line) - -message(STATUS "Available arches: ${available_arches}") +######################################################################## +# done overrules! print the result +######################################################################## +message(STATUS "Available architectures: ${available_archs}") ######################################################################## -# Parse the machines xml file: -# Test each machine to see if its arch dependencies are supported. -# Build a list of supported machines and the machine definitions. +# determine available machines given the available architectures ######################################################################## -#extract the machine lines from the xml file using crazy python execute_process( - COMMAND ${PYTHON_EXECUTABLE} -c - "from xml.dom import minidom; print ';'.join(map(lambda a: '%s %s'%(a.attributes['name'].value,a.getElementsByTagName('archs')[0].firstChild.data),minidom.parse('${CMAKE_SOURCE_DIR}/gen/machines.xml').getElementsByTagName('machine')))" - OUTPUT_VARIABLE machine_lines OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "machines" --archs "${available_archs}" + OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE ) -macro(handle_machine1 name) - unset(machine_flags) - string(TOUPPER LV_MACHINE_${name} machine_def) - - #check if all the arches are supported - foreach(arch ${ARGN}) - set(is_match ${have_${arch}}) - if(NOT is_match) - set(is_match FALSE) - break() - endif(NOT is_match) - set(machine_flags "${machine_flags} ${${arch}_flag}") - endforeach(arch) - - string(REGEX REPLACE "^[ \t]+" "" machine_flags "${machine_flags}") - - if(is_match) - #this is a match, append the source and set its flags - set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${name}.c) - set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${machine_flags}") - list(APPEND machine_sources ${machine_source}) - list(APPEND machine_defs ${machine_def}) - list(APPEND available_machines ${name}) - endif() -endmacro(handle_machine1) - -macro(handle_machine name) - set(arches ${ARGN}) - list(FIND arches "32|64" index) - if(${index} EQUAL -1) - handle_machine1(${name} ${arches}) - else() - list(REMOVE_ITEM arches "32|64") - handle_machine1(${name}_32 32 ${arches}) - handle_machine1(${name}_64 64 ${arches}) - endif() -endmacro(handle_machine) - -#setup the available machines -foreach(machine_line ${machine_lines}) - string(REPLACE " " ";" args "${machine_line}") - handle_machine(${args}) -endforeach(machine_line) +######################################################################## +# Implement machine overruling for redundant machines: +# A machine is redundant when expansion rules occur, +# and the arch superset passes configuration checks. +# When this occurs, eliminate the redundant machines +# to avoid unnecessary compilation of subset machines. +######################################################################## +foreach(arch orc 64 32) + foreach(machine_name ${available_machines}) + string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name}) + if (${machine_name} STREQUAL ${machine_name_no_arch}) + else() + list(REMOVE_ITEM available_machines ${machine_name_no_arch}) + endif() + endforeach(machine_name) +endforeach(arch) +######################################################################## +# done overrules! print the result +######################################################################## message(STATUS "Available machines: ${available_machines}") ######################################################################## # Create rules to run the volk generator ######################################################################## -#list of the generated sources -set(volk_gen_sources - ${CMAKE_BINARY_DIR}/include/volk/volk.h - ${CMAKE_BINARY_DIR}/lib/volk.c - ${CMAKE_BINARY_DIR}/lib/volk_init.h - ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h - ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h - ${CMAKE_BINARY_DIR}/lib/volk_cpu.c - ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h - ${CMAKE_BINARY_DIR}/lib/volk_environment_init.c - ${CMAKE_BINARY_DIR}/lib/volk_environment_init.h - ${CMAKE_BINARY_DIR}/lib/volk_machines.h - ${CMAKE_BINARY_DIR}/lib/volk_machines.c - ${machine_sources} -) #dependencies are all python, xml, and header implementation files file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml) file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py) file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h) -add_custom_command( - OUTPUT ${volk_gen_sources} - DEPENDS ${xml_files} ${py_files} ${h_files} - COMMAND ${PYTHON_EXECUTABLE} -B - ${CMAKE_SOURCE_DIR}/gen/volk_register.py - ${CMAKE_BINARY_DIR} -) +macro(gen_template tmpl output) + list(APPEND volk_gen_sources ${output}) + add_custom_command( + OUTPUT ${output} + DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl} + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_tmpl_utils.py + --input ${tmpl} --output ${output} ${ARGN} + ) +endmacro(gen_template) + +make_directory(${CMAKE_BINARY_DIR}/include/volk) + +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk.c) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_cpu.c) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_machines.h) +gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_machines.c) + +foreach(machine_name ${available_machines}) + #generate machine source + set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c) + gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name}) + + #determine machine flags + execute_process( + COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B} + ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py + --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}" + OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(${machine_name}_flags) + set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}") + endif() + + #add to available machine defs + string(TOUPPER LV_MACHINE_${machine_name} machine_def) + list(APPEND machine_defs ${machine_def}) +endforeach(machine_name) ######################################################################## # Set local include directories first @@ -270,7 +243,7 @@ if(ORC_FOUND) #setup orc library usage include_directories(${ORC_INCLUDE_DIRS}) link_directories(${ORC_LIBRARY_DIRS}) - add_definitions(-DLV_HAVE_ORC) + list(APPEND volk_libraries ${ORC_LIBRARIES}) #setup orc functions file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc) @@ -313,16 +286,15 @@ PROPERTIES COMPILE_DEFINITIONS "${machine_defs}") if(MSVC) #add compatibility includes for stdint types - include_directories(${CMAKE_SOURCE_DIR}/msvc) + include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc) + add_definitions(-DHAVE_CONFIG_H) #compile the sources as C++ due to the lack of complex.h under MSVC set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX) endif() #create the volk runtime library add_library(volk SHARED ${volk_sources}) -if(ORC_FOUND) - target_link_libraries(volk ${ORC_LIBRARIES}) -endif(ORC_FOUND) +target_link_libraries(volk ${volk_libraries}) set_target_properties(volk PROPERTIES SOVERSION ${LIBVER}) set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS") diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h index 2d0916fb36..3c3f47b003 100644 --- a/volk/lib/gcc_x86_cpuid.h +++ b/volk/lib/gcc_x86_cpuid.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. * * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -35,6 +35,8 @@ #define bit_XSAVE (1 << 26) #define bit_OSXSAVE (1 << 27) #define bit_AVX (1 << 28) +#define bit_F16C (1 << 29) +#define bit_RDRND (1 << 30) /* %edx */ #define bit_CMPXCHG8B (1 << 8) @@ -47,14 +49,22 @@ /* Extended Features */ /* %ecx */ #define bit_LAHF_LM (1 << 0) +#define bit_ABM (1 << 5) #define bit_SSE4a (1 << 6) -#define bit_SSE5 (1 << 11) +#define bit_XOP (1 << 11) +#define bit_LWP (1 << 15) +#define bit_FMA4 (1 << 16) +#define bit_TBM (1 << 21) /* %edx */ +#define bit_MMXEXT (1 << 22) #define bit_LM (1 << 29) #define bit_3DNOWP (1 << 30) #define bit_3DNOW (1 << 31) +/* Extended Features (%eax == 7) */ +#define bit_FSGSBASE (1 << 0) +#define bit_BMI (1 << 3) #if defined(__i386__) && defined(__PIC__) /* %ebx may be the PIC register. */ @@ -114,8 +124,8 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig) unsigned int __eax, __ebx, __ecx, __edx; #ifndef __x86_64__ -#if __GNUC__ >= 3 /* See if we can use cpuid. On AMD64 we always can. */ +#if __GNUC__ >= 3 __asm__ ("pushf{l|d}\n\t" "pushf{l|d}\n\t" "pop{l}\t%0\n\t" diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc index 5d5eb7e188..8da43b9724 100644 --- a/volk/lib/qa_16s_add_quad_aligned16.cc +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -16,7 +16,7 @@ void qa_16s_add_quad_aligned16::t1() { void qa_16s_add_quad_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; @@ -27,7 +27,7 @@ void qa_16s_add_quad_aligned16::t1() { __VOLK_ATTR_ALIGNED(16) short input2[vlen]; __VOLK_ATTR_ALIGNED(16) short input3[vlen]; __VOLK_ATTR_ALIGNED(16) short input4[vlen]; - + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; __VOLK_ATTR_ALIGNED(16) short output1[vlen]; __VOLK_ATTR_ALIGNED(16) short output2[vlen]; @@ -48,13 +48,13 @@ void qa_16s_add_quad_aligned16::t1() { short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; - + input0[i] = plus0 - minus0; input1[i] = plus1 - minus1; input2[i] = plus2 - minus2; input3[i] = plus3 - minus3; input4[i] = plus4 - minus4; - + } printf("16s_add_quad_aligned\n"); @@ -76,7 +76,7 @@ void qa_16s_add_quad_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc index 2e6e6a1a0b..5a58569a1d 100644 --- a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -22,17 +22,17 @@ void qa_16s_branch_4_state_8_aligned16::t1() { static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; static char* permuters[4] = {permute0, permute1, permute2, permute3}; - + unsigned int num_bytes = vlen << 1; volk_environment_init(); clock_t start, end; double total; - + __VOLK_ATTR_ALIGNED(16) short target[vlen]; __VOLK_ATTR_ALIGNED(16) short target2[vlen]; __VOLK_ATTR_ALIGNED(16) short target3[vlen]; - + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = { 7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; @@ -45,29 +45,29 @@ void qa_16s_branch_4_state_8_aligned16::t1() { __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = { 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; - - + + for(int i = 0; i < vlen; ++i) { src0[i] = i; - + } - + printf("16s_branch_4_state_8_aligned\n"); - - + + start = clock(); for(int i = 0; i < num_iters; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("permute_and_scalar_add_time: %f\n", total); - - + + start = clock(); for(int i = 0; i < num_iters; ++i) { @@ -78,25 +78,25 @@ void qa_16s_branch_4_state_8_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("branch_4_state_8_time, ssse3: %f\n", total); - + start = clock(); for(int i = 0; i < num_iters; ++i) { volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("permute_and_scalar_add_time, generic: %f\n", total); - - - + + + for(int i = 0; i < vlen; ++i) { printf("psa... %d, b4s8... %d\n", target[i], target3[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT(target[i] == target2[i]); CPPUNIT_ASSERT(target[i] == target3[i]); } diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc index 3cd4e906df..dadd2c5804 100644 --- a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -16,13 +16,13 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { void qa_16s_permute_and_scalar_add_aligned16::t1() { const int vlen = 64; - + unsigned int num_bytes = vlen << 1; volk_environment_init(); clock_t start, end; double total; - + __VOLK_ATTR_ALIGNED(16) short target[vlen]; __VOLK_ATTR_ALIGNED(16) short target2[vlen]; __VOLK_ATTR_ALIGNED(16) short src0[vlen]; @@ -43,7 +43,7 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { } printf("16s_permute_and_scalar_add_aligned\n"); - + start = clock(); for(int i = 0; i < 100000; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); @@ -53,24 +53,24 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); - + start = clock(); for(int i = 0; i < 100000; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse2_time: %f\n", total); - - + + for(int i = 0; i < vlen; ++i) { //printf("generic... %d, sse2... %d\n", target[i], target2[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT(target[i] == target2[i]); } } diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc index 192a69e350..2a5dec44ab 100644 --- a/volk/lib/qa_16s_quad_max_star_aligned16.cc +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -16,7 +16,7 @@ void qa_16s_quad_max_star_aligned16::t1() { void qa_16s_quad_max_star_aligned16::t1() { const int vlen = 34; - + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; __VOLK_ATTR_ALIGNED(16) short input1[vlen]; __VOLK_ATTR_ALIGNED(16) short input2[vlen]; @@ -50,9 +50,9 @@ void qa_16s_quad_max_star_aligned16::t1() { for(int i = 0; i < vlen; ++i) { printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); } } diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc index a2e7a85be3..4e792ec6cb 100644 --- a/volk/lib/qa_32f_fm_detect_aligned16.cc +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -15,18 +15,18 @@ void qa_32f_fm_detect_aligned16::t1() { #else void qa_32f_fm_detect_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; const int vlen = 3201; const int ITERS = 10000; __VOLK_ATTR_ALIGNED(16) float input0[vlen]; - + __VOLK_ATTR_ALIGNED(16) float output0[vlen]; __VOLK_ATTR_ALIGNED(16) float output01[vlen]; - for(int i = 0; i < vlen; ++i) { + for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); } printf("32f_fm_detect_aligned\n"); @@ -51,7 +51,7 @@ void qa_32f_fm_detect_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc index a1c3d4cd14..2df206726d 100644 --- a/volk/lib/qa_32f_index_max_aligned16.cc +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -34,12 +34,12 @@ void qa_32f_index_max_aligned16::t1(){ void qa_32f_index_max_aligned16::t1(){ - + const int vlen = VEC_LEN; - + volk_runtime_init(); - + volk_environment_init(); int ret; @@ -47,8 +47,8 @@ void qa_32f_index_max_aligned16::t1(){ unsigned int* target_sse; unsigned int* target_generic; float* src0 ; - - + + unsigned int i_target_sse4_1; target_sse4_1 = &i_target_sse4_1; unsigned int i_target_sse; @@ -57,20 +57,20 @@ void qa_32f_index_max_aligned16::t1(){ target_generic = &i_target_generic; ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); - + random_floats((float*)src0, vlen); - + printf("32f_index_max_aligned16\n"); clock_t start, end; double total; - - + + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); } - end = clock(); + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic time: %f\n", total); @@ -78,25 +78,25 @@ void qa_32f_index_max_aligned16::t1(){ for(int k = 0; k < NUM_ITERS; ++k) { volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse time: %f\n", total); - + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.1 time: %f\n", total); - - + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); - + free(src0); } diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc index 4d83f16395..3859bcb522 100644 --- a/volk/lib/qa_32fc_index_max_aligned16.cc +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -33,36 +33,36 @@ void qa_32fc_index_max_aligned16::t1(){ void qa_32fc_index_max_aligned16::t1(){ - + const int vlen = VEC_LEN; - + volk_environment_init(); int ret; - + unsigned int* target; unsigned int* target_generic; std::complex<float>* src0 ; - - + + unsigned int i_target; target = &i_target; unsigned int i_target_generic; target_generic = &i_target_generic; ret = posix_memalign((void**)&src0, 16, vlen << 3); - + random_floats((float*)src0, vlen * 2); - + printf("32fc_index_max_aligned16\n"); clock_t start, end; double total; - - + + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); } - end = clock(); + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic time: %f\n", total); @@ -70,19 +70,19 @@ void qa_32fc_index_max_aligned16::t1(){ for(int k = 0; k < NUM_ITERS; ++k) { volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3 time: %f\n", total); - - - + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); - - + + free(src0); } diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc index 981bb19e69..daca31d9ce 100644 --- a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -15,14 +15,14 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { #else void qa_32fc_power_spectral_density_32f_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; const int vlen = 3201; const int ITERS = 10000; __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen]; - + __VOLK_ATTR_ALIGNED(16) float output_generic[vlen]; __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen]; @@ -30,7 +30,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { const float rbw = 1.7; float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { + for(int i = 0; i < 2*vlen; ++i) { inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); } printf("32fc_power_spectral_density_32f_aligned\n"); @@ -54,7 +54,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc index fefdf06eeb..b825c20e4e 100644 --- a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc @@ -7,7 +7,7 @@ #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); #define ERR_DELTA (1e-4) @@ -35,7 +35,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { std::complex<float>* input; std::complex<float>* taps; - + std::complex<float>* result_generic; std::complex<float>* result; @@ -43,19 +43,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { ret = posix_memalign((void**)&taps, 16, vlen << 3); ret = posix_memalign((void**)&result_generic, 16, 8); ret = posix_memalign((void**)&result, 16, 8); - + result_generic[0] = std::complex<float>(0,0); result[0] = std::complex<float>(0,0); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); - - + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); - + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse"); printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); @@ -67,7 +67,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { free(taps); free(result_generic); free(result); - + } @@ -87,13 +87,13 @@ random_floats (float *buf, unsigned n) void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { const int vlen = 789743; - + volk_environment_init(); int ret; std::complex<float>* input; std::complex<float>* taps; - + std::complex<float>* result_generic; std::complex<float>* result; @@ -101,19 +101,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { ret = posix_memalign((void**)&taps, 16, vlen << 3); ret = posix_memalign((void**)&result_generic, 16, 8); ret = posix_memalign((void**)&result, 16, 8); - + result_generic[0] = std::complex<float>(0,0); result[0] = std::complex<float>(0,0); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); - - + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); - + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32"); printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); @@ -125,7 +125,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { free(taps); free(result_generic); free(result); - + } diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc index c880260f2a..5559d933df 100644 --- a/volk/lib/qa_32u_popcnt_aligned16.cc +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -16,8 +16,8 @@ void qa_32u_popcnt_aligned16::t1() { #else void qa_32u_popcnt_aligned16::t1() { - - + + volk_runtime_init(); volk_environment_init(); @@ -26,7 +26,7 @@ void qa_32u_popcnt_aligned16::t1() { const int ITERS = 10000000; __VOLK_ATTR_ALIGNED(16) uint32_t input0; - + __VOLK_ATTR_ALIGNED(16) uint32_t output0; __VOLK_ATTR_ALIGNED(16) uint32_t output01; @@ -55,7 +55,7 @@ void qa_32u_popcnt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.2_time: %f\n", total); - + CPPUNIT_ASSERT_EQUAL(output0, output01); } diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc index 6be4e50ea0..391601f223 100644 --- a/volk/lib/qa_64u_popcnt_aligned16.cc +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -16,8 +16,8 @@ void qa_64u_popcnt_aligned16::t1() { #else void qa_64u_popcnt_aligned16::t1() { - - + + volk_runtime_init(); volk_environment_init(); @@ -26,7 +26,7 @@ void qa_64u_popcnt_aligned16::t1() { const int ITERS = 10000000; __VOLK_ATTR_ALIGNED(16) uint64_t input0; - + __VOLK_ATTR_ALIGNED(16) uint64_t output0; __VOLK_ATTR_ALIGNED(16) uint64_t output01; @@ -55,7 +55,7 @@ void qa_64u_popcnt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.2_time: %f\n", total); - + CPPUNIT_ASSERT_EQUAL(output0, output01); } diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index bb37801c94..4e361aece2 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -15,6 +15,7 @@ #include <volk/volk_common.h> #include <boost/typeof/typeof.hpp> #include <boost/type_traits.hpp> +#include <stdio.h> float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) @@ -46,7 +47,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { case 4: if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; - break; + break; case 2: if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; @@ -69,7 +70,7 @@ static std::vector<std::string> get_arch_list(struct volk_func_desc desc) { //if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc archlist.push_back(std::string(desc.indices[i])); } - + return archlist; } @@ -81,15 +82,15 @@ volk_type_t volk_type_from_string(std::string name) { type.is_signed = false; type.size = 0; type.str = name; - + if(name.size() < 2) throw std::string("name too short to be a datatype"); - + //is it a scalar? - if(name[0] == 's') { + if(name[0] == 's') { type.is_scalar = true; name = name.substr(1, name.size()-1); } - + //get the data size size_t last_size_pos = name.find_last_of("0123456789"); if(last_size_pos < 0) throw std::string("no size spec in type ").append(name); @@ -98,7 +99,7 @@ volk_type_t volk_type_from_string(std::string name) { assert(((size % 8) == 0) && (size <= 64) && (size != 0)); type.size = size/8; //in bytes - + for(size_t i=last_size_pos+1; i < name.size(); i++) { switch (name[i]) { case 'f': @@ -117,19 +118,19 @@ volk_type_t volk_type_from_string(std::string name) { throw; } } - + return type; } -static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, - std::vector<volk_type_t> &outputsig, +static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, + std::vector<volk_type_t> &outputsig, std::string name) { boost::char_separator<char> sep("_"); boost::tokenizer<boost::char_separator<char> > tok(name, sep); std::vector<std::string> toked; tok.assign(name); toked.assign(tok.begin(), tok.end()); - + assert(toked[0] == "volk"); toked.erase(toked.begin()); @@ -143,7 +144,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, try { type = volk_type_from_string(token); if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... - + if(side == SIDE_INPUT) inputsig.push_back(type); else outputsig.push_back(type); } catch (...){ @@ -160,7 +161,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, side = SIDE_NAME; fn_name.append("_"); fn_name.append(token); - } + } else if(side == SIDE_OUTPUT) { if(token != toked.back()) throw; //the last token in the name is the alignment } @@ -168,6 +169,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, } //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! assert(inputsig.size() != 0); + } inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { @@ -223,7 +225,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { } } } - + return fail; } @@ -239,7 +241,7 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { } } } - + return fail; } @@ -261,13 +263,14 @@ bool run_volk_tests(struct volk_func_desc desc, lv_32fc_t scalar, int vlen, int iter, - std::vector<std::string> *best_arch_vector = 0 + std::vector<std::string> *best_arch_vector = 0, + std::string puppet_master_name = "NULL" ) { std::cout << "RUN_VOLK_TESTS: " << name << std::endl; - + //first let's get a list of available architectures for the test std::vector<std::string> arch_list = get_arch_list(desc); - + if(arch_list.size() < 2) { std::cout << "no architectures to test" << std::endl; return false; @@ -286,9 +289,9 @@ bool run_volk_tests(struct volk_func_desc desc, if(inputsig[i].is_scalar) { inputsc.push_back(inputsig[i]); inputsig.erase(inputsig.begin() + i); + i -= 1; } } - //for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl; //for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl; std::vector<void *> inbuffs; @@ -299,7 +302,7 @@ bool run_volk_tests(struct volk_func_desc desc, for(size_t i=0; i<inbuffs.size(); i++) { load_random_data(inbuffs[i], inputsig[i], vlen); } - + //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch std::vector<std::vector<void *> > test_data; for(size_t i=0; i<arch_list.size(); i++) { @@ -312,7 +315,7 @@ bool run_volk_tests(struct volk_func_desc desc, } test_data.push_back(arch_buffs); } - + std::vector<volk_type_t> both_sigs; both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); @@ -326,7 +329,7 @@ bool run_volk_tests(struct volk_func_desc desc, switch(both_sigs.size()) { case 1: if(inputsc.size() == 0) { - run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { if(inputsc[0].is_complex) { run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); @@ -364,23 +367,23 @@ bool run_volk_tests(struct volk_func_desc desc, throw "no function handler for this signature"; break; } - + end = clock(); double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC; std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl; profile_times.push_back(arch_time); } - + //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... size_t generic_offset=0; - for(size_t i=0; i<arch_list.size(); i++) + for(size_t i=0; i<arch_list.size(); i++) if(arch_list[i] == "generic") generic_offset=i; //now compare //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know - + bool fail = false; bool fail_global = false; std::vector<bool> arch_results; @@ -438,7 +441,7 @@ bool run_volk_tests(struct volk_func_desc desc, } arch_results.push_back(!fail); } - + double best_time = std::numeric_limits<double>::max(); std::string best_arch = "generic"; for(size_t i=0; i < arch_list.size(); i++) { @@ -447,10 +450,15 @@ bool run_volk_tests(struct volk_func_desc desc, best_arch = arch_list[i]; } } - + std::cout << "Best arch: " << best_arch << std::endl; if(best_arch_vector) { - best_arch_vector->push_back(name + std::string(" ") + best_arch); + if(puppet_master_name == "NULL") { + best_arch_vector->push_back(name + std::string(" ") + best_arch); + } + else { + best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch); + } } return fail_global; diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index b998df852e..1e639ac3c6 100644 --- a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -21,10 +21,12 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *); +bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string); -#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0), 0); } -#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results) + +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL") +#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func)) typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 593087f85d..f0011190e1 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); @@ -33,8 +34,10 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); @@ -52,10 +55,11 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); +//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1); //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1); @@ -101,3 +105,4 @@ VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1); diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c index 7e705bed46..5e5c9dfff7 100644 --- a/volk/lib/volk_prefs.c +++ b/volk/lib/volk_prefs.c @@ -26,7 +26,7 @@ int load_preferences(struct volk_arch_pref **prefs) { char path[512], line[512], function[128], arch[32]; int n_arch_prefs = 0; struct volk_arch_pref *t_pref; - + //get the config path get_config_path(path); if (path == NULL) return n_arch_prefs; //no prefs found diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c index 4baa078bca..865d60955c 100644 --- a/volk/lib/volk_rank_archs.c +++ b/volk/lib/volk_rank_archs.c @@ -26,14 +26,14 @@ unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsign n_arch_prefs = load_preferences(&volk_arch_prefs); prefs_loaded = 1; } - + //now look for the function name in the prefs list for(i=0; i < n_arch_prefs; i++) { if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it return get_index(indices, n_archs, volk_arch_prefs[i].arch); } } - + for(i=1; i < n_archs; ++i) { if((arch_defs[i]&(!arch)) == 0) { best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val; |