summaryrefslogtreecommitdiff
path: root/volk/lib
diff options
context:
space:
mode:
authorTom Rondeau <trondeau@vt.edu>2012-06-24 16:06:38 -0400
committerTom Rondeau <trondeau@vt.edu>2012-06-24 16:06:38 -0400
commitd97764ce6040aff8023b43323b4bc048a907a1fd (patch)
treedda4ba74128454fd271ffe7dcbe673f2ae0b4c08 /volk/lib
parentb142f64573b8b7b15a94fc9c64e2f26264a5b144 (diff)
parentbf8700a226091c5ce0130a3819a4c12b9d9981a6 (diff)
Merge branch 'master' into my_qtsink
Conflicts: gr-qtgui/include/qtgui_sink_c.h gr-qtgui/include/qtgui_sink_f.h gr-qtgui/include/qtgui_util.h gr-qtgui/lib/ConstellationDisplayPlot.cc gr-qtgui/lib/FrequencyDisplayPlot.cc gr-qtgui/lib/SpectrumGUIClass.cc gr-qtgui/lib/TimeDomainDisplayPlot.cc gr-qtgui/lib/WaterfallDisplayPlot.cc gr-qtgui/lib/qtgui_sink_c.cc gr-qtgui/lib/qtgui_sink_f.cc gr-qtgui/lib/spectrumdisplayform.cc
Diffstat (limited to 'volk/lib')
-rw-r--r--volk/lib/CMakeLists.txt364
-rw-r--r--volk/lib/gcc_x86_cpuid.h16
-rw-r--r--volk/lib/qa_16s_add_quad_aligned16.cc10
-rw-r--r--volk/lib/qa_16s_branch_4_state_8_aligned16.cc38
-rw-r--r--volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc18
-rw-r--r--volk/lib/qa_16s_quad_max_star_aligned16.cc6
-rw-r--r--volk/lib/qa_32f_fm_detect_aligned16.cc8
-rw-r--r--volk/lib/qa_32f_index_max_aligned16.cc36
-rw-r--r--volk/lib/qa_32fc_index_max_aligned16.cc34
-rw-r--r--volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc8
-rw-r--r--volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc28
-rw-r--r--volk/lib/qa_32u_popcnt_aligned16.cc8
-rw-r--r--volk/lib/qa_64u_popcnt_aligned16.cc8
-rw-r--r--volk/lib/qa_utils.cc66
-rw-r--r--volk/lib/qa_utils.h8
-rw-r--r--volk/lib/testqa.cc11
-rw-r--r--volk/lib/volk_prefs.c2
-rw-r--r--volk/lib/volk_rank_archs.c4
18 files changed, 335 insertions, 338 deletions
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index b491f94bbb..8288786c98 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -1,5 +1,5 @@
#
-# Copyright 2011 Free Software Foundation, Inc.
+# Copyright 2011-2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -16,6 +16,31 @@
#
########################################################################
+# header file detection
+########################################################################
+include(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+if(HAVE_CPUID_H)
+ add_definitions(-DHAVE_CPUID_H)
+endif()
+
+CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H)
+if(HAVE_INTRIN_H)
+ add_definitions(-DHAVE_INTRIN_H)
+endif()
+
+CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H)
+if(HAVE_FENV_H)
+ add_definitions(-DHAVE_FENV_H)
+endif()
+
+CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H)
+if(HAVE_DLFCN_H)
+ add_definitions(-DHAVE_DLFCN_H)
+ list(APPEND volk_libraries ${CMAKE_DL_LIBS})
+endif()
+
+########################################################################
# Setup the compiler name
########################################################################
set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
@@ -23,235 +48,183 @@ if(MSVC) #its not set otherwise
set(COMPILER_NAME MSVC)
endif()
+message(STATUS "Compiler name: ${COMPILER_NAME}")
+
if(NOT DEFINED COMPILER_NAME)
message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
endif()
########################################################################
-# Parse the arches xml file:
-# Test each arch to see if the compiler supports the flag.
-# If the test passes append the arch to the available list.
+# detect x86 flavor of CPU
########################################################################
-#extract the compiler lines from the xml file using abusive python
-
-
-
-execute_process(
- COMMAND ${PYTHON_EXECUTABLE} -c
- "from xml.dom import minidom; print ';'.join(map(lambda b: ','.join([','.join([b.attributes['name'].value,item.attributes['name'].value,item.firstChild.data]) for item in b.getElementsByTagName('remap')]), minidom.parse('${CMAKE_SOURCE_DIR}/gen/compilers.xml').getElementsByTagName('compiler')))"
-
- OUTPUT_VARIABLE compiler_lines OUTPUT_STRIP_TRAILING_WHITESPACE
-)
-
-foreach(thing ${compiler_lines})
- string(REGEX REPLACE "," ";" thing_list ${thing})
- list(FIND thing_list ${COMPILER_NAME} check_val)
- if(NOT ("${check_val}" STREQUAL "-1"))
- string(REGEX REPLACE "${COMPILER_NAME}," ";" filter_string ${thing})
- endif()
-endforeach()
-
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64)$")
+ message(STATUS "x86* CPU detected")
+ set(CPU_IS_x86 TRUE)
+endif()
-#extract compiler prefixes from the xml file using abusive python
+########################################################################
+# determine passing architectures based on compile flag tests
+########################################################################
execute_process(
- COMMAND ${PYTHON_EXECUTABLE} -c
- "from xml.dom import minidom; print ';'.join(map(lambda b: ','.join([','.join([b.attributes['name'].value,item.firstChild.data]) for item in b.getElementsByTagName('prefix')]), minidom.parse('${CMAKE_SOURCE_DIR}/gen/compilers.xml').getElementsByTagName('compiler')))"
-
- OUTPUT_VARIABLE compiler_prefixes OUTPUT_STRIP_TRAILING_WHITESPACE
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "arch_flags" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
)
-foreach(thing ${compiler_prefixes})
- string(REGEX REPLACE "," ";" thing_list ${thing})
- list(FIND thing_list ${COMPILER_NAME} check_val)
- if(NOT ("${check_val}" STREQUAL "-1"))
- list(GET thing_list "1" prefix)
+macro(check_arch arch_name)
+ set(flags ${ARGN})
+ set(have_${arch_name} TRUE)
+ foreach(flag ${flags})
+ include(CheckCXXCompilerFlag)
+ set(have_flag have${flag})
+ execute_process( #make the have_flag have nice alphanum chars (just for looks/not necessary)
+ COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
+ OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
+ if (NOT ${have_flag})
+ set(have_${arch_name} FALSE)
+ endif()
+ endforeach(flag)
+ if (have_${arch_name})
+ list(APPEND available_archs ${arch_name})
endif()
-endforeach()
-
-
-
-
-#extract the arch lines from the xml file using abusive python
-execute_process(
- COMMAND ${PYTHON_EXECUTABLE} -c
- "from xml.dom import minidom; print ';'.join(map(lambda a: '%s %s %s %s'%(a.attributes['name'].value,a.getElementsByTagName('flag')[0].firstChild.data,a.getElementsByTagName('overrule')[0].firstChild.data,a.getElementsByTagName('overrule_val')[0].firstChild.data) if (len(a.getElementsByTagName('overrule'))) else '%s %s %s %s'%(a.attributes['name'].value,a.getElementsByTagName('flag')[0].firstChild.data,'no_overrule', 'no_overrule_val'), minidom.parse('${CMAKE_SOURCE_DIR}/gen/archs.xml').getElementsByTagName('arch')))"
-
- OUTPUT_VARIABLE arch_lines OUTPUT_STRIP_TRAILING_WHITESPACE
-)
+endmacro(check_arch)
+foreach(line ${arch_flag_lines})
+ string(REGEX REPLACE "," ";" arch_flags ${line})
+ check_arch(${arch_flags})
+endforeach(line)
+macro(OVERRULE_ARCH arch reason)
+ message(STATUS "${reason}, Overruled arch ${arch}")
+ list(REMOVE_ITEM available_archs ${arch})
+endmacro(OVERRULE_ARCH)
-
-#set the various overrule values (see archs.xml)
-#a lot of this is translating between automake and cmake
-if(NOT "${CROSSCOMPILE_MULTILIB}" STREQUAL "true")
- set(MD_SUBCPU ${CMAKE_SYSTEM_PROCESSOR})
- #detect 32 or 64 bit compiler
- if(MD_SUBCPU MATCHES "^(i.86|x86|x86_64|amd64)$")
- include(CheckTypeSize)
- check_type_size("void*" SIZEOF_VOID_P BUILTIN_TYPES_ONLY)
- if (${SIZEOF_VOID_P} EQUAL 8)
- set(MD_SUBCPU x86_64)
- else()
- set(MD_SUBCPU x86)
- endif()
+########################################################################
+# eliminate AVX on GCC < 4.4
+# even though it accepts -mavx, as won't assemble xgetbv, which we need
+########################################################################
+if(CPU_IS_x86 AND COMPILER_NAME MATCHES "GNU")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
+ OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(GCC_VERSION VERSION_LESS "4.4")
+ OVERRULE_ARCH(avx "GCC missing xgetbv")
endif()
endif()
-if(NOT "${ORC_FOUND}" STREQUAL "TRUE")
- set(LV_HAVE_ORC "no")
-endif()
-
+########################################################################
+# implement overruling in the ORC case,
+# since ORC always passes flag detection
+########################################################################
+if(NOT ORC_FOUND)
+ OVERRULE_ARCH(orc "ORC support not found")
+endif()
-
-
-
-macro(compiler_filter name flag)
- set(filtered_flag ${flag})
- foreach(thing ${filter_string})
- string(REGEX REPLACE "," ";" flagmap ${thing})
- list(GET flagmap "0" key)
- list(GET flagmap "1" val)
- string(REGEX MATCH "^${key}$" found ${flag})
- if("${found}" STREQUAL "${key}")
- string(REGEX REPLACE "^${key}$" "${val}" filtered_flag ${flag})
- endif()
- endforeach()
- set(${name}_flag "${prefix}${filtered_flag}")
-endmacro()
-
-
-
-
-
-
-
-macro(handle_arch name flag overrule overrule_val)
-
- #handle overrule
- if("${${overrule}}" STREQUAL "${overrule_val}")
- set(have_${name} FALSE)
- message(STATUS "${name} overruled")
- #handle special case for none flag
- elseif(${flag} STREQUAL "none")
- set(have_${name} TRUE)
- #otherwise test the flag(s) against the compiler
- else()
- include(CheckCXXCompilerFlag)
- string(REGEX REPLACE "," ";" flag_list ${flag})
- set(have_${name} 1)
- foreach(thing ${flag_list})
- compiler_filter(${name} ${thing})
- CHECK_CXX_COMPILER_FLAG(${${name}_flag} have_${thing})
- if(NOT (${have_${name}} AND ("${have_${thing}}" STREQUAL "1")))
- set(have_${name} 0)
- endif()
- endforeach()
+########################################################################
+# implement overruling in the non-multilib case
+# this makes things work when both -m32 and -m64 pass
+########################################################################
+if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
+ include(CheckTypeSize)
+ check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
+ if (${SIZEOF_CPU} EQUAL 64)
+ OVERRULE_ARCH(32 "CPU width is 64 bits")
endif()
-
- if(have_${name})
- list(APPEND available_arches ${name})
+ if (${SIZEOF_CPU} EQUAL 32)
+ OVERRULE_ARCH(64 "CPU width is 32 bits")
endif()
+endif()
-endmacro(handle_arch)
-
-#create a list of available arches
-foreach(arch_line ${arch_lines})
- string(REPLACE " " ";" args "${arch_line}")
- handle_arch(${args})
-endforeach(arch_line)
-
-message(STATUS "Available arches: ${available_arches}")
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available architectures: ${available_archs}")
########################################################################
-# Parse the machines xml file:
-# Test each machine to see if its arch dependencies are supported.
-# Build a list of supported machines and the machine definitions.
+# determine available machines given the available architectures
########################################################################
-#extract the machine lines from the xml file using crazy python
execute_process(
- COMMAND ${PYTHON_EXECUTABLE} -c
- "from xml.dom import minidom; print ';'.join(map(lambda a: '%s %s'%(a.attributes['name'].value,a.getElementsByTagName('archs')[0].firstChild.data),minidom.parse('${CMAKE_SOURCE_DIR}/gen/machines.xml').getElementsByTagName('machine')))"
- OUTPUT_VARIABLE machine_lines OUTPUT_STRIP_TRAILING_WHITESPACE
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machines" --archs "${available_archs}"
+ OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
)
-macro(handle_machine1 name)
- unset(machine_flags)
- string(TOUPPER LV_MACHINE_${name} machine_def)
-
- #check if all the arches are supported
- foreach(arch ${ARGN})
- set(is_match ${have_${arch}})
- if(NOT is_match)
- set(is_match FALSE)
- break()
- endif(NOT is_match)
- set(machine_flags "${machine_flags} ${${arch}_flag}")
- endforeach(arch)
-
- string(REGEX REPLACE "^[ \t]+" "" machine_flags "${machine_flags}")
-
- if(is_match)
- #this is a match, append the source and set its flags
- set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${name}.c)
- set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${machine_flags}")
- list(APPEND machine_sources ${machine_source})
- list(APPEND machine_defs ${machine_def})
- list(APPEND available_machines ${name})
- endif()
-endmacro(handle_machine1)
-
-macro(handle_machine name)
- set(arches ${ARGN})
- list(FIND arches "32|64" index)
- if(${index} EQUAL -1)
- handle_machine1(${name} ${arches})
- else()
- list(REMOVE_ITEM arches "32|64")
- handle_machine1(${name}_32 32 ${arches})
- handle_machine1(${name}_64 64 ${arches})
- endif()
-endmacro(handle_machine)
-
-#setup the available machines
-foreach(machine_line ${machine_lines})
- string(REPLACE " " ";" args "${machine_line}")
- handle_machine(${args})
-endforeach(machine_line)
+########################################################################
+# Implement machine overruling for redundant machines:
+# A machine is redundant when expansion rules occur,
+# and the arch superset passes configuration checks.
+# When this occurs, eliminate the redundant machines
+# to avoid unnecessary compilation of subset machines.
+########################################################################
+foreach(arch orc 64 32)
+ foreach(machine_name ${available_machines})
+ string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
+ if (${machine_name} STREQUAL ${machine_name_no_arch})
+ else()
+ list(REMOVE_ITEM available_machines ${machine_name_no_arch})
+ endif()
+ endforeach(machine_name)
+endforeach(arch)
+########################################################################
+# done overrules! print the result
+########################################################################
message(STATUS "Available machines: ${available_machines}")
########################################################################
# Create rules to run the volk generator
########################################################################
-#list of the generated sources
-set(volk_gen_sources
- ${CMAKE_BINARY_DIR}/include/volk/volk.h
- ${CMAKE_BINARY_DIR}/lib/volk.c
- ${CMAKE_BINARY_DIR}/lib/volk_init.h
- ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h
- ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
- ${CMAKE_BINARY_DIR}/lib/volk_cpu.c
- ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
- ${CMAKE_BINARY_DIR}/lib/volk_environment_init.c
- ${CMAKE_BINARY_DIR}/lib/volk_environment_init.h
- ${CMAKE_BINARY_DIR}/lib/volk_machines.h
- ${CMAKE_BINARY_DIR}/lib/volk_machines.c
- ${machine_sources}
-)
#dependencies are all python, xml, and header implementation files
file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
file(GLOB h_files ${CMAKE_SOURCE_DIR}/include/volk/*.h)
-add_custom_command(
- OUTPUT ${volk_gen_sources}
- DEPENDS ${xml_files} ${py_files} ${h_files}
- COMMAND ${PYTHON_EXECUTABLE} -B
- ${CMAKE_SOURCE_DIR}/gen/volk_register.py
- ${CMAKE_BINARY_DIR}
-)
+macro(gen_template tmpl output)
+ list(APPEND volk_gen_sources ${output})
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_tmpl_utils.py
+ --input ${tmpl} --output ${output} ${ARGN}
+ )
+endmacro(gen_template)
+
+make_directory(${CMAKE_BINARY_DIR}/include/volk)
+
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_cpu.c)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_machines.h)
+gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_machines.c)
+
+foreach(machine_name ${available_machines})
+ #generate machine source
+ set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c)
+ gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name})
+
+ #determine machine flags
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${CMAKE_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(${machine_name}_flags)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+ #add to available machine defs
+ string(TOUPPER LV_MACHINE_${machine_name} machine_def)
+ list(APPEND machine_defs ${machine_def})
+endforeach(machine_name)
########################################################################
# Set local include directories first
@@ -270,7 +243,7 @@ if(ORC_FOUND)
#setup orc library usage
include_directories(${ORC_INCLUDE_DIRS})
link_directories(${ORC_LIBRARY_DIRS})
- add_definitions(-DLV_HAVE_ORC)
+ list(APPEND volk_libraries ${ORC_LIBRARIES})
#setup orc functions
file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc)
@@ -313,16 +286,15 @@ PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")
if(MSVC)
#add compatibility includes for stdint types
- include_directories(${CMAKE_SOURCE_DIR}/msvc)
+ include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
+ add_definitions(-DHAVE_CONFIG_H)
#compile the sources as C++ due to the lack of complex.h under MSVC
set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX)
endif()
#create the volk runtime library
add_library(volk SHARED ${volk_sources})
-if(ORC_FOUND)
- target_link_libraries(volk ${ORC_LIBRARIES})
-endif(ORC_FOUND)
+target_link_libraries(volk ${volk_libraries})
set_target_properties(volk PROPERTIES SOVERSION ${LIBVER})
set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h
index 2d0916fb36..3c3f47b003 100644
--- a/volk/lib/gcc_x86_cpuid.h
+++ b/volk/lib/gcc_x86_cpuid.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
+ * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
*
* This file is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
@@ -35,6 +35,8 @@
#define bit_XSAVE (1 << 26)
#define bit_OSXSAVE (1 << 27)
#define bit_AVX (1 << 28)
+#define bit_F16C (1 << 29)
+#define bit_RDRND (1 << 30)
/* %edx */
#define bit_CMPXCHG8B (1 << 8)
@@ -47,14 +49,22 @@
/* Extended Features */
/* %ecx */
#define bit_LAHF_LM (1 << 0)
+#define bit_ABM (1 << 5)
#define bit_SSE4a (1 << 6)
-#define bit_SSE5 (1 << 11)
+#define bit_XOP (1 << 11)
+#define bit_LWP (1 << 15)
+#define bit_FMA4 (1 << 16)
+#define bit_TBM (1 << 21)
/* %edx */
+#define bit_MMXEXT (1 << 22)
#define bit_LM (1 << 29)
#define bit_3DNOWP (1 << 30)
#define bit_3DNOW (1 << 31)
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE (1 << 0)
+#define bit_BMI (1 << 3)
#if defined(__i386__) && defined(__PIC__)
/* %ebx may be the PIC register. */
@@ -114,8 +124,8 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig)
unsigned int __eax, __ebx, __ecx, __edx;
#ifndef __x86_64__
-#if __GNUC__ >= 3
/* See if we can use cpuid. On AMD64 we always can. */
+#if __GNUC__ >= 3
__asm__ ("pushf{l|d}\n\t"
"pushf{l|d}\n\t"
"pop{l}\t%0\n\t"
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
index 5d5eb7e188..8da43b9724 100644
--- a/volk/lib/qa_16s_add_quad_aligned16.cc
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -16,7 +16,7 @@ void qa_16s_add_quad_aligned16::t1() {
void qa_16s_add_quad_aligned16::t1() {
-
+
volk_environment_init();
clock_t start, end;
double total;
@@ -27,7 +27,7 @@ void qa_16s_add_quad_aligned16::t1() {
__VOLK_ATTR_ALIGNED(16) short input2[vlen];
__VOLK_ATTR_ALIGNED(16) short input3[vlen];
__VOLK_ATTR_ALIGNED(16) short input4[vlen];
-
+
__VOLK_ATTR_ALIGNED(16) short output0[vlen];
__VOLK_ATTR_ALIGNED(16) short output1[vlen];
__VOLK_ATTR_ALIGNED(16) short output2[vlen];
@@ -48,13 +48,13 @@ void qa_16s_add_quad_aligned16::t1() {
short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
-
+
input0[i] = plus0 - minus0;
input1[i] = plus1 - minus1;
input2[i] = plus2 - minus2;
input3[i] = plus3 - minus3;
input4[i] = plus4 - minus4;
-
+
}
printf("16s_add_quad_aligned\n");
@@ -76,7 +76,7 @@ void qa_16s_add_quad_aligned16::t1() {
//printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
//printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
//printf("%d...%d\n", output0[i], output01[i]);
CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
index 2e6e6a1a0b..5a58569a1d 100644
--- a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -22,17 +22,17 @@ void qa_16s_branch_4_state_8_aligned16::t1() {
static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
static char* permuters[4] = {permute0, permute1, permute2, permute3};
-
+
unsigned int num_bytes = vlen << 1;
volk_environment_init();
clock_t start, end;
double total;
-
+
__VOLK_ATTR_ALIGNED(16) short target[vlen];
__VOLK_ATTR_ALIGNED(16) short target2[vlen];
__VOLK_ATTR_ALIGNED(16) short target3[vlen];
-
+
__VOLK_ATTR_ALIGNED(16) short src0[vlen];
__VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = {
7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
@@ -45,29 +45,29 @@ void qa_16s_branch_4_state_8_aligned16::t1() {
__VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = {
0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
__VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};
-
-
+
+
for(int i = 0; i < vlen; ++i) {
src0[i] = i;
-
+
}
-
+
printf("16s_branch_4_state_8_aligned\n");
-
-
+
+
start = clock();
for(int i = 0; i < num_iters; ++i) {
volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
}
end = clock();
-
+
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("permute_and_scalar_add_time: %f\n", total);
-
-
+
+
start = clock();
for(int i = 0; i < num_iters; ++i) {
@@ -78,25 +78,25 @@ void qa_16s_branch_4_state_8_aligned16::t1() {
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("branch_4_state_8_time, ssse3: %f\n", total);
-
+
start = clock();
for(int i = 0; i < num_iters; ++i) {
volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
}
end = clock();
-
+
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("permute_and_scalar_add_time, generic: %f\n", total);
-
-
-
+
+
+
for(int i = 0; i < vlen; ++i) {
printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
-
+
CPPUNIT_ASSERT(target[i] == target2[i]);
CPPUNIT_ASSERT(target[i] == target3[i]);
}
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
index 3cd4e906df..dadd2c5804 100644
--- a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -16,13 +16,13 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() {
void qa_16s_permute_and_scalar_add_aligned16::t1() {
const int vlen = 64;
-
+
unsigned int num_bytes = vlen << 1;
volk_environment_init();
clock_t start, end;
double total;
-
+
__VOLK_ATTR_ALIGNED(16) short target[vlen];
__VOLK_ATTR_ALIGNED(16) short target2[vlen];
__VOLK_ATTR_ALIGNED(16) short src0[vlen];
@@ -43,7 +43,7 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() {
}
printf("16s_permute_and_scalar_add_aligned\n");
-
+
start = clock();
for(int i = 0; i < 100000; ++i) {
volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
@@ -53,24 +53,24 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() {
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("generic_time: %f\n", total);
-
+
start = clock();
for(int i = 0; i < 100000; ++i) {
volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
}
end = clock();
-
+
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse2_time: %f\n", total);
-
-
+
+
for(int i = 0; i < vlen; ++i) {
//printf("generic... %d, sse2... %d\n", target[i], target2[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
-
+
CPPUNIT_ASSERT(target[i] == target2[i]);
}
}
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
index 192a69e350..2a5dec44ab 100644
--- a/volk/lib/qa_16s_quad_max_star_aligned16.cc
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -16,7 +16,7 @@ void qa_16s_quad_max_star_aligned16::t1() {
void qa_16s_quad_max_star_aligned16::t1() {
const int vlen = 34;
-
+
__VOLK_ATTR_ALIGNED(16) short input0[vlen];
__VOLK_ATTR_ALIGNED(16) short input1[vlen];
__VOLK_ATTR_ALIGNED(16) short input2[vlen];
@@ -50,9 +50,9 @@ void qa_16s_quad_max_star_aligned16::t1() {
for(int i = 0; i < vlen; ++i) {
printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
-
+
CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
}
}
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
index a2e7a85be3..4e792ec6cb 100644
--- a/volk/lib/qa_32f_fm_detect_aligned16.cc
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -15,18 +15,18 @@ void qa_32f_fm_detect_aligned16::t1() {
#else
void qa_32f_fm_detect_aligned16::t1() {
-
+
volk_environment_init();
clock_t start, end;
double total;
const int vlen = 3201;
const int ITERS = 10000;
__VOLK_ATTR_ALIGNED(16) float input0[vlen];
-
+
__VOLK_ATTR_ALIGNED(16) float output0[vlen];
__VOLK_ATTR_ALIGNED(16) float output01[vlen];
- for(int i = 0; i < vlen; ++i) {
+ for(int i = 0; i < vlen; ++i) {
input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
}
printf("32f_fm_detect_aligned\n");
@@ -51,7 +51,7 @@ void qa_32f_fm_detect_aligned16::t1() {
//printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
//printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
//printf("%d...%d\n", output0[i], output01[i]);
CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
index a1c3d4cd14..2df206726d 100644
--- a/volk/lib/qa_32f_index_max_aligned16.cc
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -34,12 +34,12 @@ void qa_32f_index_max_aligned16::t1(){
void qa_32f_index_max_aligned16::t1(){
-
+
const int vlen = VEC_LEN;
-
+
volk_runtime_init();
-
+
volk_environment_init();
int ret;
@@ -47,8 +47,8 @@ void qa_32f_index_max_aligned16::t1(){
unsigned int* target_sse;
unsigned int* target_generic;
float* src0 ;
-
-
+
+
unsigned int i_target_sse4_1;
target_sse4_1 = &i_target_sse4_1;
unsigned int i_target_sse;
@@ -57,20 +57,20 @@ void qa_32f_index_max_aligned16::t1(){
target_generic = &i_target_generic;
ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
-
+
random_floats((float*)src0, vlen);
-
+
printf("32f_index_max_aligned16\n");
clock_t start, end;
double total;
-
-
+
+
start = clock();
for(int k = 0; k < NUM_ITERS; ++k) {
volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
}
- end = clock();
+ end = clock();
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("generic time: %f\n", total);
@@ -78,25 +78,25 @@ void qa_32f_index_max_aligned16::t1(){
for(int k = 0; k < NUM_ITERS; ++k) {
volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
}
-
- end = clock();
+
+ end = clock();
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse time: %f\n", total);
-
+
start = clock();
for(int k = 0; k < NUM_ITERS; ++k) {
get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen);
}
-
- end = clock();
+
+ end = clock();
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse4.1 time: %f\n", total);
-
-
+
+
printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);
CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
-
+
free(src0);
}
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
index 4d83f16395..3859bcb522 100644
--- a/volk/lib/qa_32fc_index_max_aligned16.cc
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -33,36 +33,36 @@ void qa_32fc_index_max_aligned16::t1(){
void qa_32fc_index_max_aligned16::t1(){
-
+
const int vlen = VEC_LEN;
-
+
volk_environment_init();
int ret;
-
+
unsigned int* target;
unsigned int* target_generic;
std::complex<float>* src0 ;
-
-
+
+
unsigned int i_target;
target = &i_target;
unsigned int i_target_generic;
target_generic = &i_target_generic;
ret = posix_memalign((void**)&src0, 16, vlen << 3);
-
+
random_floats((float*)src0, vlen * 2);
-
+
printf("32fc_index_max_aligned16\n");
clock_t start, end;
double total;
-
-
+
+
start = clock();
for(int k = 0; k < NUM_ITERS; ++k) {
volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
}
- end = clock();
+ end = clock();
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("generic time: %f\n", total);
@@ -70,19 +70,19 @@ void qa_32fc_index_max_aligned16::t1(){
for(int k = 0; k < NUM_ITERS; ++k) {
volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
}
-
- end = clock();
+
+ end = clock();
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse3 time: %f\n", total);
-
-
-
+
+
+
printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);
CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
-
-
+
+
free(src0);
}
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
index 981bb19e69..daca31d9ce 100644
--- a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -15,14 +15,14 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() {
#else
void qa_32fc_power_spectral_density_32f_aligned16::t1() {
-
+
volk_environment_init();
clock_t start, end;
double total;
const int vlen = 3201;
const int ITERS = 10000;
__VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen];
-
+
__VOLK_ATTR_ALIGNED(16) float output_generic[vlen];
__VOLK_ATTR_ALIGNED(16) float output_sse3[vlen];
@@ -30,7 +30,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() {
const float rbw = 1.7;
float* inputLoad = (float*)input0;
- for(int i = 0; i < 2*vlen; ++i) {
+ for(int i = 0; i < 2*vlen; ++i) {
inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
}
printf("32fc_power_spectral_density_32f_aligned\n");
@@ -54,7 +54,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() {
//printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
//printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
}
-
+
for(int i = 0; i < vlen; ++i) {
//printf("%d...%d\n", output0[i], output01[i]);
CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
index fefdf06eeb..b825c20e4e 100644
--- a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
+++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc
@@ -7,7 +7,7 @@
#define assertcomplexEqual(expected, actual, delta) \
CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
- CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
+ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);
#define ERR_DELTA (1e-4)
@@ -35,7 +35,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
std::complex<float>* input;
std::complex<float>* taps;
-
+
std::complex<float>* result_generic;
std::complex<float>* result;
@@ -43,19 +43,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
ret = posix_memalign((void**)&taps, 16, vlen << 3);
ret = posix_memalign((void**)&result_generic, 16, 8);
ret = posix_memalign((void**)&result, 16, 8);
-
+
result_generic[0] = std::complex<float>(0,0);
result[0] = std::complex<float>(0,0);
random_floats((float*)input, vlen * 2);
random_floats((float*)taps, vlen * 2);
-
-
+
+
volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic");
-
+
volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse");
printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
@@ -67,7 +67,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
free(taps);
free(result_generic);
free(result);
-
+
}
@@ -87,13 +87,13 @@ random_floats (float *buf, unsigned n)
void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
const int vlen = 789743;
-
+
volk_environment_init();
int ret;
std::complex<float>* input;
std::complex<float>* taps;
-
+
std::complex<float>* result_generic;
std::complex<float>* result;
@@ -101,19 +101,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
ret = posix_memalign((void**)&taps, 16, vlen << 3);
ret = posix_memalign((void**)&result_generic, 16, 8);
ret = posix_memalign((void**)&result, 16, 8);
-
+
result_generic[0] = std::complex<float>(0,0);
result[0] = std::complex<float>(0,0);
random_floats((float*)input, vlen * 2);
random_floats((float*)taps, vlen * 2);
-
-
+
+
volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic");
-
+
volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32");
printf("32fc_x2_conjugate_dot_prod_32fc_u\n");
@@ -125,7 +125,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() {
free(taps);
free(result_generic);
free(result);
-
+
}
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
index c880260f2a..5559d933df 100644
--- a/volk/lib/qa_32u_popcnt_aligned16.cc
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -16,8 +16,8 @@ void qa_32u_popcnt_aligned16::t1() {
#else
void qa_32u_popcnt_aligned16::t1() {
-
-
+
+
volk_runtime_init();
volk_environment_init();
@@ -26,7 +26,7 @@ void qa_32u_popcnt_aligned16::t1() {
const int ITERS = 10000000;
__VOLK_ATTR_ALIGNED(16) uint32_t input0;
-
+
__VOLK_ATTR_ALIGNED(16) uint32_t output0;
__VOLK_ATTR_ALIGNED(16) uint32_t output01;
@@ -55,7 +55,7 @@ void qa_32u_popcnt_aligned16::t1() {
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse4.2_time: %f\n", total);
-
+
CPPUNIT_ASSERT_EQUAL(output0, output01);
}
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
index 6be4e50ea0..391601f223 100644
--- a/volk/lib/qa_64u_popcnt_aligned16.cc
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -16,8 +16,8 @@ void qa_64u_popcnt_aligned16::t1() {
#else
void qa_64u_popcnt_aligned16::t1() {
-
-
+
+
volk_runtime_init();
volk_environment_init();
@@ -26,7 +26,7 @@ void qa_64u_popcnt_aligned16::t1() {
const int ITERS = 10000000;
__VOLK_ATTR_ALIGNED(16) uint64_t input0;
-
+
__VOLK_ATTR_ALIGNED(16) uint64_t output0;
__VOLK_ATTR_ALIGNED(16) uint64_t output01;
@@ -55,7 +55,7 @@ void qa_64u_popcnt_aligned16::t1() {
total = (double)(end-start)/(double)CLOCKS_PER_SEC;
printf("sse4.2_time: %f\n", total);
-
+
CPPUNIT_ASSERT_EQUAL(output0, output01);
}
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index bb37801c94..4e361aece2 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -15,6 +15,7 @@
#include <volk/volk_common.h>
#include <boost/typeof/typeof.hpp>
#include <boost/type_traits.hpp>
+#include <stdio.h>
float uniform() {
return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1)
@@ -46,7 +47,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
case 4:
if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand;
else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
- break;
+ break;
case 2:
if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand;
else ((uint16_t *)data)[i] = (uint16_t) scaled_rand;
@@ -69,7 +70,7 @@ static std::vector<std::string> get_arch_list(struct volk_func_desc desc) {
//if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc
archlist.push_back(std::string(desc.indices[i]));
}
-
+
return archlist;
}
@@ -81,15 +82,15 @@ volk_type_t volk_type_from_string(std::string name) {
type.is_signed = false;
type.size = 0;
type.str = name;
-
+
if(name.size() < 2) throw std::string("name too short to be a datatype");
-
+
//is it a scalar?
- if(name[0] == 's') {
+ if(name[0] == 's') {
type.is_scalar = true;
name = name.substr(1, name.size()-1);
}
-
+
//get the data size
size_t last_size_pos = name.find_last_of("0123456789");
if(last_size_pos < 0) throw std::string("no size spec in type ").append(name);
@@ -98,7 +99,7 @@ volk_type_t volk_type_from_string(std::string name) {
assert(((size % 8) == 0) && (size <= 64) && (size != 0));
type.size = size/8; //in bytes
-
+
for(size_t i=last_size_pos+1; i < name.size(); i++) {
switch (name[i]) {
case 'f':
@@ -117,19 +118,19 @@ volk_type_t volk_type_from_string(std::string name) {
throw;
}
}
-
+
return type;
}
-static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
- std::vector<volk_type_t> &outputsig,
+static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
+ std::vector<volk_type_t> &outputsig,
std::string name) {
boost::char_separator<char> sep("_");
boost::tokenizer<boost::char_separator<char> > tok(name, sep);
std::vector<std::string> toked;
tok.assign(name);
toked.assign(tok.begin(), tok.end());
-
+
assert(toked[0] == "volk");
toked.erase(toked.begin());
@@ -143,7 +144,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
try {
type = volk_type_from_string(token);
if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
-
+
if(side == SIDE_INPUT) inputsig.push_back(type);
else outputsig.push_back(type);
} catch (...){
@@ -160,7 +161,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
side = SIDE_NAME;
fn_name.append("_");
fn_name.append(token);
- }
+ }
else if(side == SIDE_OUTPUT) {
if(token != toked.back()) throw; //the last token in the name is the alignment
}
@@ -168,6 +169,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
}
//we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
assert(inputsig.size() != 0);
+
}
inline void run_cast_test1(volk_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
@@ -223,7 +225,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
}
}
}
-
+
return fail;
}
@@ -239,7 +241,7 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
}
}
}
-
+
return fail;
}
@@ -261,13 +263,14 @@ bool run_volk_tests(struct volk_func_desc desc,
lv_32fc_t scalar,
int vlen,
int iter,
- std::vector<std::string> *best_arch_vector = 0
+ std::vector<std::string> *best_arch_vector = 0,
+ std::string puppet_master_name = "NULL"
) {
std::cout << "RUN_VOLK_TESTS: " << name << std::endl;
-
+
//first let's get a list of available architectures for the test
std::vector<std::string> arch_list = get_arch_list(desc);
-
+
if(arch_list.size() < 2) {
std::cout << "no architectures to test" << std::endl;
return false;
@@ -286,9 +289,9 @@ bool run_volk_tests(struct volk_func_desc desc,
if(inputsig[i].is_scalar) {
inputsc.push_back(inputsig[i]);
inputsig.erase(inputsig.begin() + i);
+ i -= 1;
}
}
-
//for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl;
//for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl;
std::vector<void *> inbuffs;
@@ -299,7 +302,7 @@ bool run_volk_tests(struct volk_func_desc desc,
for(size_t i=0; i<inbuffs.size(); i++) {
load_random_data(inbuffs[i], inputsig[i], vlen);
}
-
+
//ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
std::vector<std::vector<void *> > test_data;
for(size_t i=0; i<arch_list.size(); i++) {
@@ -312,7 +315,7 @@ bool run_volk_tests(struct volk_func_desc desc,
}
test_data.push_back(arch_buffs);
}
-
+
std::vector<volk_type_t> both_sigs;
both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
@@ -326,7 +329,7 @@ bool run_volk_tests(struct volk_func_desc desc,
switch(both_sigs.size()) {
case 1:
if(inputsc.size() == 0) {
- run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
@@ -364,23 +367,23 @@ bool run_volk_tests(struct volk_func_desc desc,
throw "no function handler for this signature";
break;
}
-
+
end = clock();
double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC;
std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl;
profile_times.push_back(arch_time);
}
-
+
//and now compare each output to the generic output
//first we have to know which output is the generic one, they aren't in order...
size_t generic_offset=0;
- for(size_t i=0; i<arch_list.size(); i++)
+ for(size_t i=0; i<arch_list.size(); i++)
if(arch_list[i] == "generic") generic_offset=i;
//now compare
//if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
-
+
bool fail = false;
bool fail_global = false;
std::vector<bool> arch_results;
@@ -438,7 +441,7 @@ bool run_volk_tests(struct volk_func_desc desc,
}
arch_results.push_back(!fail);
}
-
+
double best_time = std::numeric_limits<double>::max();
std::string best_arch = "generic";
for(size_t i=0; i < arch_list.size(); i++) {
@@ -447,10 +450,15 @@ bool run_volk_tests(struct volk_func_desc desc,
best_arch = arch_list[i];
}
}
-
+
std::cout << "Best arch: " << best_arch << std::endl;
if(best_arch_vector) {
- best_arch_vector->push_back(name + std::string(" ") + best_arch);
+ if(puppet_master_name == "NULL") {
+ best_arch_vector->push_back(name + std::string(" ") + best_arch);
+ }
+ else {
+ best_arch_vector->push_back(puppet_master_name + std::string(" ") + best_arch);
+ }
}
return fail_global;
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index b998df852e..1e639ac3c6 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -21,10 +21,12 @@ volk_type_t volk_type_from_string(std::string);
float uniform(void);
void random_floats(float *buf, unsigned n);
-bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *);
+bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *, std::string);
-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0), 0); }
-#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results)
+
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); }
+#define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func))
typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*);
typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 593087f85d..f0011190e1 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
@@ -33,8 +34,10 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
@@ -52,10 +55,11 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1);
+VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1);
@@ -101,3 +105,4 @@ VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc_a, 1e-2, (lv_32fc_t)lv_cmake(0.953939201, 0.3), 20460, 1);
diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c
index 7e705bed46..5e5c9dfff7 100644
--- a/volk/lib/volk_prefs.c
+++ b/volk/lib/volk_prefs.c
@@ -26,7 +26,7 @@ int load_preferences(struct volk_arch_pref **prefs) {
char path[512], line[512], function[128], arch[32];
int n_arch_prefs = 0;
struct volk_arch_pref *t_pref;
-
+
//get the config path
get_config_path(path);
if (path == NULL) return n_arch_prefs; //no prefs found
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
index 4baa078bca..865d60955c 100644
--- a/volk/lib/volk_rank_archs.c
+++ b/volk/lib/volk_rank_archs.c
@@ -26,14 +26,14 @@ unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsign
n_arch_prefs = load_preferences(&volk_arch_prefs);
prefs_loaded = 1;
}
-
+
//now look for the function name in the prefs list
for(i=0; i < n_arch_prefs; i++) {
if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it
return get_index(indices, n_archs, volk_arch_prefs[i].arch);
}
}
-
+
for(i=1; i < n_archs; ++i) {
if((arch_defs[i]&(!arch)) == 0) {
best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val;