diff --git a/CMakeLists.txt b/CMakeLists.txt index 909272b9d870330487d2bf2ae6bf4211aaf0a60e..8e88df1d441cb1cb847d01de13b0ca352f61ffae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,10 @@ SET(CMAKE_INCLUDE_CURRENT_DIR ON) SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_C_STANDARD 11) +# detect the host CPU architecture (uname -m, e.g. x86_64 or aarch64); the result selects the BLAS/LAPACK backend +EXECUTE_PROCESS(COMMAND uname -m OUTPUT_VARIABLE ARCHITECTURE OUTPUT_STRIP_TRAILING_WHITESPACE) +set(CMAKE_LIBRARY_ARCHITECTURE ${ARCHITECTURE}) + # fix win stack size issue if(WIN32) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--stack,10000000") @@ -52,68 +56,99 @@ IF(NOT BOOST_LIB) ENDIF() INCLUDE_DIRECTORIES("${BOOST_LIB}") -SET(MKLROOT "$ENV{MKLROOT}") -IF(NOT MKLROOT) - MESSAGE(FATAL_ERROR "Specify environment variable MKLROOT to the location of Intel MKL package") -ENDIF() -INCLUDE_DIRECTORIES("${MKLROOT}/include") - # PLINK2.0 pgen library; INCLUDE_DIRECTORIES("submods/plink-ng/2.0") if(WIN32) set(CMAKE_FIND_LIBRARY_SUFFIXES ".lib") else() - set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a" ".so") endif() -find_library(lib_lp64 NAMES mkl_intel_lp64 PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR APPLE) - find_library(lib_thread NAMES mkl_intel_thread PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") -else() - find_library(lib_thread NAMES mkl_gnu_thread PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") -endif() -find_library(lib_core NAMES mkl_core PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") -set(MKL_LIB ${lib_lp64} ${lib_thread} ${lib_core}) - IF(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE RELEASE) message("Setting to release build") ENDIF() -IF(NOT SSE) - set(SSE "-msse2") - message("SSE build") -ENDIF() -if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 ${SSE} -DMKL_LP64 -pthread") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -DEIGEN_USE_MKL_ALL") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g") - 
add_compile_options(-fopenmp) - set(CMAKE_EXE_LINKER_FLAGS "-Wl,-no_implicit_dylibs") - if(NOT APPLE) - # don't delete this it will result in _implibs +message(STATUS "ARCHITECTURE:${CMAKE_LIBRARY_ARCHITECTURE}") +if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64") + # Use blas library of KML and lapack of openblas + # TODO, connect lapack library of kml when kml_lapck completed + SET(KMLROOT "$ENV{KMLROOT}") + IF(NOT KMLROOT) + MESSAGE(FATAL_ERROR "Specify environment variable KMLROOT to the location of KML package") + ENDIF() + INCLUDE_DIRECTORIES("${KMLROOT}/include") + + SET(OPENBLAS "$ENV{OPENBLAS}") + IF(NOT OPENBLAS) + MESSAGE(FATAL_ERROR "Specify environment variable OPENBLAS to the location of OPENBLAS package") + ENDIF() + INCLUDE_DIRECTORIES("${OPENBLAS}/include") + + find_library(lib_blas64 NAMES kblas PATHS "${KMLROOT}/lib/kblas/nolocking" "${KMLROOT}/lib") + find_library(lib_openblas NAMES openblas PATHS "${OPENBLAS}" "${OPENBLAS}/lib") + set(KML_LIB ${lib_blas64} ${lib_openblas}) + + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR} -pthread -fvisibility=hidden -fvisibility-inlines-hidden") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG") #-flto + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3") + add_compile_options(-fopenmp) set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") + else() + message(FATAL_ERROR "In aarch64 architecture, please use GNU compiler") + endif() +else() + SET(MKLROOT "$ENV{MKLROOT}") + IF(NOT MKLROOT) + MESSAGE(FATAL_ERROR "Specify environment variable MKLROOT to the location of Intel MKL package") + ENDIF() + INCLUDE_DIRECTORIES("${MKLROOT}/include") + + find_library(lib_lp64 NAMES mkl_intel_lp64 PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR APPLE) + find_library(lib_thread NAMES mkl_intel_thread PATHS 
"${MKLROOT}/lib/intel64" "${MKLROOT}/lib") + else() + find_library(lib_thread NAMES mkl_gnu_thread PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") + endif() + find_library(lib_core NAMES mkl_core PATHS "${MKLROOT}/lib/intel64" "${MKLROOT}/lib") + set(MKL_LIB ${lib_lp64} ${lib_thread} ${lib_core}) + + IF(NOT SSE) + set(SSE "-msse2") + message("SSE build") + ENDIF() + + if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 ${SSE} -DMKL_LP64 -pthread") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -DEIGEN_USE_MKL_ALL") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g") + add_compile_options(-fopenmp) + set(CMAKE_EXE_LINKER_FLAGS "-Wl,-no_implicit_dylibs") + if(NOT APPLE) + # don't delete this it will result in _implibs + set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") + endif() + elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 ${SSE} -pthread -DMKL_LP64 -fvisibility=hidden -fvisibility-inlines-hidden") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -DEIGEN_USE_MKL_ALL") #-flto + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3") + add_compile_options(-fopenmp) + set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") + elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp -axAVX2 -pthread") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -ipo -DNDEBUG") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g") + elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + message(FATAL_ERROR "GCTA can not compile by Microsft C++ compiler in CMake currently, please use gcta_win64 folder to build") endif() -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 ${SSE} -pthread -DMKL_LP64 -fvisibility=hidden -fvisibility-inlines-hidden") - 
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -DEIGEN_USE_MKL_ALL") #-flto - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3") - add_compile_options(-fopenmp) - set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++") -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp -axAVX2 -pthread") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -ipo -DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g") -elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - message(FALTAL_ERROR "GCTA can not compile by Microsft C++ compiler in CMake currently, please use gcta_win64 folder to build") endif() set(COMMON_INCLUDES ${PROJECT_SOURCE_DIR}/include) set(MAIN_SOURCE "${PROJECT_SOURCE_DIR}/src/main.cpp") include_directories(${COMMON_INCLUDES}) - file(GLOB SRCS "${PROJECT_SOURCE_DIR}/src/*.cpp") list(REMOVE_ITEM SRCS "${MAIN_SOURCE}") @@ -132,7 +167,7 @@ if(APPLE) link_directories(${MKLROOT}/lib) link_directories(/usr/local/lib) else(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - #linux + #linux for amd64 link_directories(${MKLROOT}/../../../lib/intel64) endif() @@ -150,8 +185,12 @@ else() #target_link_libraries(gcta64 mainV1 ${libs_list} -Wl,-Bstatic z -Wl,--start-group ${MKL_LIB} -Wl,--end-group iomp5 -Wl,-Bdynamic pthread m dl) target_link_libraries(gcta64 mainV1 ${libs_list} Pgenlib -static z sqlite3 zstd -Wl,--start-group ${MKL_LIB} -Wl,--end-group iomp5 -Wl,--whole-archive -lpthread -Wl,--no-whole-archive m dl) else() - #target_link_libraries(gcta64 mainV1 ${libs_list} -Wl,-Bstatic z -Wl,--start-group ${MKL_LIB} -Wl,--end-group gomp -Wl,-Bdynamic pthread m dl) - target_link_libraries(gcta64 mainV1 ${libs_list} Pgenlib -static z sqlite3 zstd -Wl,--start-group ${MKL_LIB} -Wl,--end-group gomp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive m dl) + if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64") + 
target_link_libraries(gcta64 mainV1 ${libs_list} Pgenlib -static z sqlite3 zstd -Wl,--start-group ${KML_LIB} -lgfortran -Wl,--end-group gomp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive m dl) + else() + #target_link_libraries(gcta64 mainV1 ${libs_list} -Wl,-Bstatic z -Wl,--start-group ${MKL_LIB} -Wl,--end-group gomp -Wl,-Bdynamic pthread m dl) + target_link_libraries(gcta64 mainV1 ${libs_list} Pgenlib -static z sqlite3 zstd -Wl,--start-group ${MKL_LIB} -Wl,--end-group gomp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive m dl) + endif() endif() endif() diff --git a/include/Matrix.hpp b/include/Matrix.hpp index f31aa83369efd5033ad5c172cd46522aa6a1f96c..3e3038be646544c64f81b6bd4b5e475276ad5711 100644 --- a/include/Matrix.hpp +++ b/include/Matrix.hpp @@ -6,6 +6,12 @@ #include #include +#if defined(__aarch64__) + #include +#else + #include +#endif + static_assert(std::numeric_limits::is_iec559, "Not a supported compiler"); // two step to inverse the matrix @@ -29,12 +35,20 @@ bool _LLT(MatrixType &A, double &logdet){ int info, cols = (int)A.cols(); char uplo = 'L'; LOGGER.ts("LLT"); - dpotrf(&uplo, &cols, vi, &cols, &info); + #if defined(__aarch64__) + dpotrf_(&uplo, &cols, vi, &cols, &info); + #else + dpotrf(&uplo, &cols, vi, &cols, &info); + #endif //LOGGER << " LLT time: " << LOGGER.tp("LLT") << std::endl; if(info == 0){ logdet = A.diagonal().array().square().log().sum(); //LOGGER.ts("LLT_INV"); - dpotri(&uplo, &cols, vi, &cols, &info); + #if defined(__aarch64__) + dpotri_(&uplo, &cols, vi, &cols, &info); + #else + dpotri(&uplo, &cols, vi, &cols, &info); + #endif //LOGGER << " LLT inverse time: " << LOGGER.tp("LLT_INV") << std::endl; if(info == 0){ A.template triangularView() = A.transpose(); diff --git a/main/gcta.h b/main/gcta.h index a6049e8453dbf07b7ff122755faa211edc62f6d7..2a589d957f0777a848abf0bca01ff1c9abf54ec6 100644 --- a/main/gcta.h +++ b/main/gcta.h @@ -37,11 +37,17 @@ #include #include #include -#include -#include #include "Logger.h" #include 
"Matrix.hpp" +#if defined(__aarch64__) + #include + #include +#else + #include + #include +#endif + #ifdef SINGLE_PRECISION typedef Eigen::SparseMatrix eigenSparseMat; #else diff --git a/main/mkl.cpp b/main/mkl.cpp index dbf043574e55dd7f422662c56fc74e8c842e0aac..c126fc71851b439dbe98320ea038034d109125bd 100644 --- a/main/mkl.cpp +++ b/main/mkl.cpp @@ -362,7 +362,11 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) // MKL's Cholesky decomposition int info = 0, int_n = (int) n; char uplo = 'L'; - dpotrf(&uplo, &int_n, Vi_mkl, &int_n, &info); + #if defined(__aarch64__) + dpotrf_(&uplo, &int_n, Vi_mkl, &int_n, &info); + #else + dpotrf(&uplo, &int_n, Vi_mkl, &int_n, &info); + #endif //LOGGER << "Finished decompose" << endl; //spotrf( &uplo, &n, Vi_mkl, &n, &info ); if (info < 0){ @@ -379,7 +383,11 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) //LOGGER << "start inverse" << endl; // Calcualte V inverse - dpotri(&uplo, &int_n, Vi_mkl, &int_n, &info); + #if defined(__aarch64__) + dpotri_(&uplo, &int_n, Vi_mkl, &int_n, &info); + #else + dpotri(&uplo, &int_n, Vi_mkl, &int_n, &info); + #endif //LOGGER << "Inverse finished" << endl; //spotri( &uplo, &n, Vi_mkl, &n, &info ); if (info < 0){ @@ -420,7 +428,11 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #if defined(__aarch64__) + dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #else + dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #endif if (INFO < 0) LOGGER.e(0, "LU decomposition failed. 
Invalid values found in the matrix.\n"); else if (INFO > 0) { delete[] Vi_mkl; @@ -435,7 +447,11 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) } // Calcualte V inverse - dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #if defined(__aarch64__) + dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #else + dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #endif if (INFO < 0){ LOGGER.e(0, "invalid values found in the varaince-covaraince (V) matrix.\n"); }else if (INFO > 0){ @@ -474,7 +490,11 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #if defined(__aarch64__) + dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #else + dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); + #endif if (INFO < 0) LOGGER.e(0, "LU decomposition failed. Invalid values found in the matrix.\n"); else if (INFO > 0) { // free memory @@ -491,7 +511,11 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) } // Calcualte V inverse - dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #if defined(__aarch64__) + dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #else + dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); + #endif if (INFO < 0) LOGGER.e(0, "invalid values found in the varaince-covaraince (V) matrix.\n"); else if (INFO > 0) { // free memory diff --git a/src/GRM.cpp b/src/GRM.cpp index 15cfedc03e99a7f8e82476db8f6497d1fc225d5d..02c3866f3c2dfd870d290b779b7afef84e7c11aa 100644 --- a/src/GRM.cpp +++ b/src/GRM.cpp @@ -33,9 +33,34 @@ #include #include #include -#include #include +#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) + /* + * __i386__ is defined by gcc and Intel compiler on Linux, + * _M_IX86 by VS compiler, + * i386 by Sun compilers on opensolaris at least + */ + #define CPU_X86 +#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) + /* 
+ * both __x86_64__ and __amd64__ are defined by gcc + * __x86_64 defined by sun compiler on opensolaris at least + * _M_AMD64 defined by MS compiler + */ + #define CPU_AMD64 +#elif defined(__arm__) || defined(__aarch64__) + #define CPU_ARM +#else + #error Unknown CPU +#endif + +#if defined(CPU_ARM) + #include +#else + #include +#endif + using std::to_string; map GRM::options; @@ -919,14 +944,14 @@ void flip64(uint64_t a[64]) { //#pragma message("multiple target of N thread") //__attribute__((target_clones("popcnt","default"))) //#endif -#ifdef __linux__ +#if defined(__linux__) && !defined(CPU_ARM) __attribute__((target("default"))) #endif uint32_t popcounts(uint64_t dw){ return popcount(dw); } -#ifdef __linux__ +#if defined(__linux__) && !defined(CPU_ARM) __attribute__((target("popcnt"))) uint32_t popcounts(uint64_t dw){ return popcount(dw); @@ -977,12 +1002,24 @@ void GRM::calculate_GRM_blas(uintptr_t *buf, const vector &markerIndex static const char uplo='L'; // A * At if(part_keep_indices.first == 0){ - dsyrk(&uplo, &notrans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); + #if defined(CPU_ARM) + dsyrk_(&uplo, &notrans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); + #else + dsyrk(&uplo, &notrans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); + #endif }else{ - //dgemm(&notrans, &trans, &m, &n, &num_marker, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); - dgemm(&notrans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); + #if defined(CPU_ARM) + dgemm_(&notrans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); + #else + //dgemm(&notrans, &trans, &m, &n, &num_marker, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); + dgemm(&notrans, &trans, &m, &s_n, &curNumValidMarkers, 
&alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); + #endif double * grm_start = grm + ((uint64_t)s_n) * m; - dsyrk(&uplo, &notrans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); + #if defined(CPU_ARM) + dsyrk_(&uplo, &notrans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); + #else + dsyrk(&uplo, &notrans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); + #endif } //memset(this->cmask_buf, 0, num_byte_cmask); diff --git a/src/LD.cpp b/src/LD.cpp index d34fa24c6547708fcfe677139bfc3fde3fef68d9..319b07d9a8182e87b0dd7f04684e34712b5eb10b 100644 --- a/src/LD.cpp +++ b/src/LD.cpp @@ -7,10 +7,15 @@ #include #include #include -#include #include #include +#if defined(__aarch64__) + #include +#else + #include +#endif + map LD::options; map LD::options_i; vector LD::processFunctions; @@ -62,7 +67,11 @@ void LD::calcLD(){ double alpha = 1.0 / (nr - 1); double *ptr1 = geno_buffer[cacl_index_buffer].get(); double *res1 = new double[nc1 * nc1]; - dsyrk(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); + #if defined(__aarch64__) + dsyrk_(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); + #else + dsyrk(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); + #endif double *res2 = nullptr; // is previous buffer active? 
@@ -72,7 +81,11 @@ void LD::calcLD(){ nc2 = cur_buffer_offset[!cacl_index_buffer] / nr; double *ptr2 = geno_buffer[!cacl_index_buffer].get(); res2 = new double[nc2 * nc1]; - dgemm(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); + #if defined(__aarch64__) + dgemm_(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); + #else + dgemm(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); + #endif } for(int i = 0; i < nc1; i++){ diff --git a/src/StatLib.cpp b/src/StatLib.cpp index 4656ed2b02760a1815a8eb127bb5f0172f1c383c..d1116360a5b9df55106a3394ef38048a8e03f8ed 100644 --- a/src/StatLib.cpp +++ b/src/StatLib.cpp @@ -18,7 +18,12 @@ #include #include #include -#include + +#if defined(__aarch64__) + #include +#else + #include +#endif using namespace boost::math; @@ -73,8 +78,13 @@ namespace StatLib{ char side = 'L'; char t = 'N'; - dormqr(&side, &t, &n, &n, &n, X, &lda, tau, c, - &lda, work, &lwork, &info); + #if defined(__aarch64__) + dormqr_(&side, &t, &n, &n, &n, X, &lda, tau, c, + &lda, work, &lwork, &info); + #else + dormqr(&side, &t, &n, &n, &n, X, &lda, tau, c, + &lda, work, &lwork, &info); + #endif if(info != 0){ return false;