From dc5cda0b874e3e24760f9fe5932910424f4ccc3e Mon Sep 17 00:00:00 2001 From: hujiahui8 Date: Fri, 19 Jan 2024 18:02:45 +0800 Subject: [PATCH] adapt <<<...>>> kernel launch --- prebuild/aarch64/build_cce.cc.o | 4 +- prebuild/aarch64/codegen_cce.cc.o | 4 +- prebuild/aarch64/lib_info.txt | 2 +- prebuild/aarch64/lower_cce.cc.o | 4 +- prebuild/aarch64/update_cce_tiling.cc.o | 4 +- prebuild/x86_64/build_cce.cc.o | 4 +- prebuild/x86_64/codegen_cce.cc.o | 4 +- prebuild/x86_64/lib_info.txt | 2 +- prebuild/x86_64/lower_cce.cc.o | 4 +- prebuild/x86_64/update_cce_tiling.cc.o | 4 +- python/akg/composite/build_module.py | 1 + src/codegen/util.h | 2 + src/profiler/ascend/profile_mgr.cc | 10 +- src/profiler/ascend/profile_mgr.h | 2 +- src/runtime/ascend/ascend_kernel_runtime.cc | 145 ++++++----- src/runtime/ascend/ascend_kernel_runtime.h | 11 +- src/runtime/ascend/ascend_memory_manager.cc | 26 +- src/runtime/ascend/kernel.h | 2 +- src/runtime/ascend/kernel_pack.cc | 40 +-- src/runtime/ascend/runtime_error_codes.cc | 1 + src/runtime/ascend/runtime_error_codes.h | 1 - src/runtime/stub/runtime_stub.cc | 126 ---------- .../incubator-tvm/src/runtime/cce/cce_acl.h | 220 +++++++++++++++++ .../src/runtime/cce/cce_common.h | 40 +-- .../src/runtime/cce/cce_device_api.cc | 51 ++-- .../src/runtime/cce/cce_module.cc | 106 ++++---- .../src/runtime/cce/cce_module.h | 1 + .../src/runtime/cce/cce_wrapper.cc | 229 +++++++++--------- .../src/runtime/cce/cce_wrapper.h | 99 ++++---- 29 files changed, 603 insertions(+), 546 deletions(-) delete mode 100644 src/runtime/stub/runtime_stub.cc create mode 100644 third_party/incubator-tvm/src/runtime/cce/cce_acl.h diff --git a/prebuild/aarch64/build_cce.cc.o b/prebuild/aarch64/build_cce.cc.o index a3baa04b..5bb0e2b7 100644 --- a/prebuild/aarch64/build_cce.cc.o +++ b/prebuild/aarch64/build_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8133cf9501549231c88271cf9e9ea7e93066aea3c1f67f7a0da53419536b1697 -size 738888 +oid sha256:72a9ad83d5fd23a92f637cfc6ca641889622413b9059ad47a16312dfc89c4116 +size 816520 diff --git a/prebuild/aarch64/codegen_cce.cc.o b/prebuild/aarch64/codegen_cce.cc.o index 7bd959af..d53b2a32 100644 --- a/prebuild/aarch64/codegen_cce.cc.o +++ b/prebuild/aarch64/codegen_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98bfdd51830580273504ac29d88c61d1f56c67dabd23ba0f919eeddcf0a757ea -size 517888 +oid sha256:34cf2ce8fe69e15fbaaa4aa7ab0c6276a39291455025a88707ad511648090312 +size 575416 diff --git a/prebuild/aarch64/lib_info.txt b/prebuild/aarch64/lib_info.txt index 57b5a011..23df6670 100644 --- a/prebuild/aarch64/lib_info.txt +++ b/prebuild/aarch64/lib_info.txt @@ -1,3 +1,3 @@ [lib information] git branch: master -commit id: dc02ec14853ce89e2c8da53742ede9bc70148c5b +commit id: cdcabb5e5465827935b4ee84c1ca8f90dc15eeec diff --git a/prebuild/aarch64/lower_cce.cc.o b/prebuild/aarch64/lower_cce.cc.o index 222a866b..0c763f75 100644 --- a/prebuild/aarch64/lower_cce.cc.o +++ b/prebuild/aarch64/lower_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7233e465b31eb29b271910348daf847d2f63e86ce85463637c7b0b8b2566f95a -size 401816 +oid sha256:997f18fa3caaabf4655cafff575a4d460714eba84802c7c5f32b0856abf72431 +size 403912 diff --git a/prebuild/aarch64/update_cce_tiling.cc.o b/prebuild/aarch64/update_cce_tiling.cc.o index 0fe27df4..4c5032ec 100644 --- a/prebuild/aarch64/update_cce_tiling.cc.o +++ b/prebuild/aarch64/update_cce_tiling.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcad4489557990b9c321235826b093354f1f078380408c9a1855a6cd271fa025 -size 373064 +oid sha256:681e529f73ef1b76f9d4b4fb201aeef929b505a33f69028545d83e75b7563c8c +size 377384 diff --git a/prebuild/x86_64/build_cce.cc.o b/prebuild/x86_64/build_cce.cc.o index 12b9c4ce..2c9efb19 100644 --- a/prebuild/x86_64/build_cce.cc.o +++ b/prebuild/x86_64/build_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7d7405aa602d798c5e225f198ce3ae05eb03a2972d66a8ebf7214e6c3536b3b -size 700536 +oid sha256:db9360ba412cf4fb438e4d68b38c0058473467d2ec822e4701db9a78b136eb3f +size 772608 diff --git a/prebuild/x86_64/codegen_cce.cc.o b/prebuild/x86_64/codegen_cce.cc.o index f9e86fee..0bf8589a 100644 --- a/prebuild/x86_64/codegen_cce.cc.o +++ b/prebuild/x86_64/codegen_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f35987b02dfa11547cde1480ccdf5eb0e69d26af3bc0e586b849110f555393db -size 473296 +oid sha256:e000d296953d4cc221b02e536bb6a2c862a008e7c7b2654ad1a7eac47e811609 +size 528240 diff --git a/prebuild/x86_64/lib_info.txt b/prebuild/x86_64/lib_info.txt index 57b5a011..23df6670 100644 --- a/prebuild/x86_64/lib_info.txt +++ b/prebuild/x86_64/lib_info.txt @@ -1,3 +1,3 @@ [lib information] git branch: master -commit id: dc02ec14853ce89e2c8da53742ede9bc70148c5b +commit id: cdcabb5e5465827935b4ee84c1ca8f90dc15eeec diff --git a/prebuild/x86_64/lower_cce.cc.o b/prebuild/x86_64/lower_cce.cc.o index 1e418541..c6366fa2 100644 --- a/prebuild/x86_64/lower_cce.cc.o +++ b/prebuild/x86_64/lower_cce.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee177b149e642dd08ee737f2b427553f022126abcd9484abe4da92829afae9fd -size 382880 +oid sha256:7dd9f41c7d997a70199538735a68f1167912423f20f652ec83332e1fd93fe281 +size 384568 diff --git a/prebuild/x86_64/update_cce_tiling.cc.o b/prebuild/x86_64/update_cce_tiling.cc.o index 9f22ef78..5c866114 100644 --- a/prebuild/x86_64/update_cce_tiling.cc.o +++ b/prebuild/x86_64/update_cce_tiling.cc.o @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a9b4cf57b305c6e86a37561ff9f25e1a708066cd1e7d4ff063e55983ddd2f2f -size 361424 +oid sha256:86487c300282f2762a8a9532f8e161691e926aa8845c13f1f1246c851bfca80d +size 367168 diff --git a/python/akg/composite/build_module.py b/python/akg/composite/build_module.py index 936dd435..26c530e7 100644 --- a/python/akg/composite/build_module.py +++ b/python/akg/composite/build_module.py @@ -672,6 +672,7 @@ def _build_to_module_ascend(desc_s_in, desc_d_in, attr, use_repo=True): is_success = build_tbe_codegen(kernel_name, stmt_json, args_json, attr, ascend_type) if not is_success: raise TypeError("npu_inference codegen failed.") + akg.tvm.get_global_func("build_host_cce")(res[1], kernel_name) return kernel_name return res diff --git a/src/codegen/util.h b/src/codegen/util.h index 6b0ad62b..6cd0062c 100644 --- a/src/codegen/util.h +++ b/src/codegen/util.h @@ -116,6 +116,8 @@ constexpr auto kIsPolyConfigReset = "is_poly_config_reset"; constexpr auto kWorkspaceAttr = "workspace"; constexpr auto kWorkspaceNum = "num"; constexpr auto kWorkspaceSize = "size"; +constexpr auto kWorkspaceName = "name"; +constexpr auto kWorkspaceType = "type"; static std::unordered_map help_tiling_level = { {"None", 0}, diff --git a/src/profiler/ascend/profile_mgr.cc b/src/profiler/ascend/profile_mgr.cc index 0ef62461..39af2c21 100644 --- a/src/profiler/ascend/profile_mgr.cc +++ b/src/profiler/ascend/profile_mgr.cc @@ -19,7 +19,7 @@ #include #include "profile_mgr.h" #include "toolchain/prof_acl_api.h" -#include +#include #include #include @@ -171,8 +171,8 @@ ProfileMgr &ProfileMgr::GetInstance() { } bool ProfileMgr::ProfRegisterCtrlCallback() const { - rtError_t rt_ret = MsprofRegisterCallback(GE, CtrlCallbackHandle); - if (rt_ret != RT_ERROR_NONE) { + aclError rt_ret = MsprofRegisterCallback(GE, CtrlCallbackHandle); + if (rt_ret != ACL_SUCCESS) { LOG(ERROR) << "Call rtProfRegisterCtrlCallback failed."; return false; } @@ -180,7 +180,7 @@ bool ProfileMgr::ProfRegisterCtrlCallback() const { return true; } -rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len) { +aclError CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len) { if (rt_type == RT_PROF_CTRL_REPORTER) { ProfileMgr::GetInstance().RegReporterCallback(reinterpret_cast(data)); LOG(INFO) << "Set MsprofReporterCallback success."; @@ -191,7 +191,7 @@ rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len) { } } - return RT_ERROR_NONE; + return ACL_SUCCESS; } Status ProfCtrlSwitchHandle(void *data) { diff --git a/src/profiler/ascend/profile_mgr.h b/src/profiler/ascend/profile_mgr.h index dc7fac98..ea3992c2 100644 --- a/src/profiler/ascend/profile_mgr.h +++ b/src/profiler/ascend/profile_mgr.h @@ -102,7 +102,7 @@ class ProfileMgr { Status ProfCommandHandle(ProfCommandHandleType type); -rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len); +aclError CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t len); Status ProfCtrlSwitchHandle(void *data); } // namespace runtime } // namespace air diff --git a/src/runtime/ascend/ascend_kernel_runtime.cc b/src/runtime/ascend/ascend_kernel_runtime.cc index e85831f5..80bf2710 100644 --- a/src/runtime/ascend/ascend_kernel_runtime.cc +++ b/src/runtime/ascend/ascend_kernel_runtime.cc @@ -15,7 +15,6 @@ */ #include #include "ascend_kernel_runtime.h" -#include "runtime/rt.h" #include "ascend_memory_manager.h" #include "kernel.h" #include "tvm.h" @@ -31,7 +30,10 @@ using std::vector; namespace air { namespace runtime { -static thread_local rtContext_t thread_local_rt_context{nullptr}; +constexpr auto kBinFileSuffix = ".so"; +constexpr auto kDoBinFileSuffix = "_do"; + +static thread_local aclrtContext thread_local_rt_context{nullptr}; AscendKernelRuntime::AscendKernelRuntime(uint32_t device_id) { set_device_id(device_id); @@ -44,10 +46,10 @@ void AscendKernelRuntime::SetContext() { if (thread_local_rt_context == rt_context_) { return; } - auto ret = rtCtxSetCurrent(rt_context_); + auto ret = aclrtSetCurrentContext(rt_context_); thread_local_rt_context = rt_context_; - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtCtxSetCurrent, ret[" << GetErrorMsg(ret) << "]"; + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtSetCurrentContext, ret[" << GetErrorMsg(ret) << "]"; } } @@ -55,9 +57,9 @@ void AscendKernelRuntime::SetCurrentContext() { if (rt_context_ == nullptr) { return; } - auto ret = rtCtxSetCurrent(rt_context_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtCtxSetCurrent, ret[" << GetErrorMsg(ret) << "]"; + auto ret = aclrtSetCurrentContext(rt_context_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtSetCurrentContext, ret[" << GetErrorMsg(ret) << "]"; } } @@ -95,9 +97,9 @@ bool AscendKernelRuntime::Init() { void AscendKernelRuntime::CreateContext() { if (rt_context_ == nullptr) { - auto ret = rtCtxCreate(&rt_context_, 0, device_id_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtCtxCreate, ret[" << static_cast(ret) << "]"; + auto ret = aclrtCreateContext(&rt_context_, device_id_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtCreateContext, ret[" << static_cast(ret) << "]"; } } SetCurrentContext(); @@ -105,48 +107,49 @@ void AscendKernelRuntime::CreateContext() { bool AscendKernelRuntime::InitDevice() { LOG(INFO) << "InitDevice: " << device_id_; - int device_count = 0; - auto ret = rtGetDeviceCount(&device_count); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtGetDeviceCount, ret[" << static_cast(ret) << "]"; + uint32_t device_count = 0; + auto ret = aclrtGetDeviceCount(&device_count); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtGetDeviceCount, ret[" << static_cast(ret) << "]"; } - ret = rtSetDevice(device_id_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtSetDevice, ret[" << static_cast(ret) << "]"; + ret = aclrtSetDevice(device_id_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtSetDevice, ret[" << static_cast(ret) << "]"; } - // Context will be created by rtSetDevice - ret = rtCtxGetCurrent(&rt_context_); - if (ret != RT_ERROR_NONE || rt_context_ == nullptr) { - LOG(FATAL) << "Call rtCtxGetCurrent failed, ret[" << GetErrorMsg(ret) << "]"; + // Context will be created by aclrtSetDevice + ret = aclrtGetCurrentContext(&rt_context_); + if (ret != ACL_SUCCESS || rt_context_ == nullptr) { + LOG(FATAL) << "Call aclrtGetCurrentContext failed, ret[" << GetErrorMsg(ret) << "]"; return false; } - ret = rtStreamCreateWithFlags(&stream_, 0, RT_STREAM_DEFAULT); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtStreamCreate, ret[" << GetErrorMsg(ret) << "]"; + ret = aclrtCreateStreamWithConfig(&stream_, 0, RT_STREAM_DEFAULT); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtCreateStreamWithConfig, ret[" << GetErrorMsg(ret) << "]"; } return true; } AscendKernelRuntime::~AscendKernelRuntime() { ReleaseDeviceRes(); + UnLoadKernelFunc(); } bool AscendKernelRuntime::ResetDevice(uint32_t device_id) { SetCurrentContext(); int32_t ret; if (stream_ != nullptr) { - ret = rtStreamDestroy(stream_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtStreamDestroy, ret[" << GetErrorMsg(ret) << "]"; + ret = aclrtDestroyStream(stream_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtDestroyStream, ret[" << GetErrorMsg(ret) << "]"; } stream_ = nullptr; } - ret = rtDeviceReset(device_id); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call rtDeviceReset, ret[" << GetErrorMsg(ret) << "]"; + ret = aclrtResetDevice(device_id); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call aclrtResetDevice, ret[" << GetErrorMsg(ret) << "]"; } // set to nullptr as its not created, only bounded to existing context rt_context_ = nullptr; @@ -161,12 +164,41 @@ inline unsigned int UlongToUint(uint64_t u) { return static_cast(u); } +void *AscendKernelRuntime::GetKernelFunc(const std::string &kernel_name, const std::string &func_name) { + const auto *f = Registry::Get("get_kernel_meta_path"); + CHECK(f != nullptr) << "Function get_kernel_meta_path is not registered"; + std::string file_str = (*f)().operator std::string(); + (void)file_str.append(kernel_name).append(kBinFileSuffix); + char *file_c_str = (char *)file_str.c_str(); + + void *handle = dlopen(file_c_str, RTLD_LAZY | RTLD_LOCAL); + CHECK(handle != nullptr) << "dlopen failed, file: " << file_c_str; + + std::string func_str = func_name + kDoBinFileSuffix; + char *func_c_str = (char *)func_str.c_str(); + void *func = dlsym(handle, func_c_str); + CHECK(func != nullptr) << "dlsym failed, symbol: " << func_str; + cce_handle_ = handle; + return func; +} + +bool AscendKernelRuntime::UnLoadKernelFunc() { + if (cce_handle_ != nullptr) { + if (dlclose(cce_handle_) != 0) { + return false; + } + } + cce_handle_ = nullptr; + return true; +} + bool AscendKernelRuntime::Run(const std::string &kernel_name, const std::vector &input_tensors, const std::vector &input_shape_args) { uint32_t blockdim = 1; // default blockdim equal to 1. + std::string func_name = kernel_name; auto kernel_pack_ptr = GetKernelPack(kernel_name); - auto func_stub = GetFuncStub(*kernel_pack_ptr, &blockdim); - if (func_stub == 0) { + auto func_stub = GetFuncStub(*kernel_pack_ptr, &blockdim, &func_name); + if (!func_stub) { LOG(FATAL) << "GenFuncStub failed."; return false; } @@ -178,23 +210,20 @@ bool AscendKernelRuntime::Run(const std::string &kernel_name, const std::vector< for (const auto &shape_arg : input_shape_args) { runtimeargs.push_back(reinterpret_cast(shape_arg)); } - rtL2Ctrl_t *l2ctrl = nullptr; - const void *stubFunc = reinterpret_cast(func_stub); - auto argsSize = static_cast(UlongToUint(sizeof(void *)) * runtimeargs.size()); + if (input_shape_args.size() > 0 && blockdim == INT_MAX) { blockdim = input_shape_args[input_shape_args.size() - 1]; } - auto ret = rtKernelLaunch(stubFunc, blockdim, runtimeargs.data(), argsSize, l2ctrl, stream()); + + typedef void (*CallFunc)(uint32_t, void*, void*, void**); + auto func_ptr = reinterpret_cast(GetKernelFunc(kernel_name, func_name)); + func_ptr(blockdim, nullptr, stream(), runtimeargs.data()); SyncStream(); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call runtime rtKernelLaunch error, ret[" << GetErrorMsg(ret) << "]"; - return false; - } #ifdef USE_CCE_PROFILING uint32_t stream_id; uint32_t task_id; auto rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); - if (rt_ret != RT_ERROR_NONE) { + if (rt_ret != ACL_SUCCESS) { LOG(FATAL) << "Profiling get task_id stream_id failed"; } auto label = std::to_string(stream_id) + "_" + std::to_string(task_id); @@ -206,27 +235,27 @@ bool AscendKernelRuntime::Run(const std::string &kernel_name, const std::vector< bool AscendKernelRuntime::SyncDeviceToHost(size_t size, void *device_ptr, void *host_ptr) { CHECK_NOTNULL(host_ptr); LOG(INFO) << "SyncDeviceToHost: " << size << " bytes from " << device_ptr << "(device) to " << host_ptr << "(host)"; - SyncMemory(host_ptr, device_ptr, size, RT_MEMCPY_DEVICE_TO_HOST); + SyncMemory(host_ptr, device_ptr, size, ACL_MEMCPY_DEVICE_TO_HOST); return true; } bool AscendKernelRuntime::SyncHostToDevice(size_t size, const void *host_ptr, void *device_ptr) { CHECK_NOTNULL(host_ptr); LOG(INFO) << "SyncHostToDevice: " << size << " bytes from " << host_ptr << "(host) to " << device_ptr << "(device)"; - SyncMemory(device_ptr, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE); + SyncMemory(device_ptr, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE); return true; } -void AscendKernelRuntime::SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind) { +void AscendKernelRuntime::SyncMemory(void *dst, const void *src, uint64_t size, aclrtMemcpyKind kind) { SetContext(); // Only apply asynchronous copy in Pynative && RT_MEMCPY_HOST_TO_DEVICE mode - if (kind != RT_MEMCPY_HOST_TO_DEVICE) { - auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind); - if (ret_rt_memcpy != RT_ERROR_NONE) { - LOG(FATAL) << "rtMemcpy failed, ret[" << ret_rt_memcpy << "]"; + if (kind != ACL_MEMCPY_HOST_TO_DEVICE) { + auto ret_rt_memcpy = aclrtMemcpy(dst, size, src, size, kind); + if (ret_rt_memcpy != ACL_SUCCESS) { + LOG(FATAL) << "aclrtMemcpy failed, ret[" << ret_rt_memcpy << "]"; } } else { - auto ret = MemcpyAsync(dst, src, size, static_cast(RT_MEMCPY_HOST_TO_DEVICE_EX)); + auto ret = MemcpyAsync(dst, src, size, static_cast(ACL_MEMCPY_HOST_TO_DEVICE)); if (!ret) { LOG(FATAL) << "MemcpyAsync failed, ret[" << GetErrorMsg(ret) << "]"; } @@ -240,13 +269,13 @@ bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, return false; } - auto copy_kind = static_cast(kind); - if (copy_kind != RT_MEMCPY_HOST_TO_DEVICE_EX) { + auto copy_kind = static_cast(kind); + if (copy_kind != ACL_MEMCPY_HOST_TO_DEVICE) { LOG(FATAL) << "Memory copy async not support cache host buffer in kind: " << kind; } - auto ret = rtMemcpyAsync(dst, size, src, size, static_cast(kind), stream_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "Call runtime rtMemcpyAsync error, ret[" << GetErrorMsg(ret) << "]"; + auto ret = aclrtMemcpyAsync(dst, size, src, size, static_cast(kind), stream_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "Call runtime aclrtMemcpyAsync error, ret[" << GetErrorMsg(ret) << "]"; return false; } return true; @@ -258,9 +287,9 @@ bool AscendKernelRuntime::SyncStream() { LOG(FATAL) << "SyncStream failed. stream_ is nullptr"; return false; } - auto ret = rtStreamSynchronize(stream_); - if (ret != RT_ERROR_NONE) { // o for switch stream - LOG(FATAL) << "Call runtime rtStreamSynchronize error, ret[" << GetErrorMsg(ret) << "]"; + auto ret = aclrtSynchronizeStream(stream_); + if (ret != ACL_SUCCESS) { // o for switch stream + LOG(FATAL) << "Call runtime aclrtSynchronizeStream error, ret[" << GetErrorMsg(ret) << "]"; return false; } return true; diff --git a/src/runtime/ascend/ascend_kernel_runtime.h b/src/runtime/ascend/ascend_kernel_runtime.h index 82eef6c6..2b4460a3 100644 --- a/src/runtime/ascend/ascend_kernel_runtime.h +++ b/src/runtime/ascend/ascend_kernel_runtime.h @@ -25,8 +25,8 @@ #include #include "ascend_memory_manager.h" #include "tensor_device.h" -#include "runtime/context.h" -#include "runtime/mem.h" +#include "runtime/cce/cce_acl.h" +#include namespace air { namespace runtime { @@ -56,13 +56,16 @@ class AscendKernelRuntime { bool InitDevice(); bool ResetDevice(uint32_t device_id); void SetCurrentContext(); - void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind); + void SyncMemory(void *dst, const void *src, uint64_t size, aclrtMemcpyKind kind); + void *GetKernelFunc(const std::string &kernel_name, const std::string &func_name); + bool UnLoadKernelFunc(); - rtContext_t rt_context_{nullptr}; + aclrtContext rt_context_{nullptr}; bool initialized_{false}; uint32_t device_id_{0}; void *stream_{nullptr}; std::shared_ptr mem_manager_{nullptr}; + void *cce_handle_{nullptr}; }; } // namespace runtime } // namespace air diff --git a/src/runtime/ascend/ascend_memory_manager.cc b/src/runtime/ascend/ascend_memory_manager.cc index a2d78cb0..9037d597 100644 --- a/src/runtime/ascend/ascend_memory_manager.cc +++ b/src/runtime/ascend/ascend_memory_manager.cc @@ -16,7 +16,7 @@ #include #include #include "ascend_memory_manager.h" -#include "runtime/mem.h" +#include "runtime/cce/cce_acl.h" #include "runtime_error_codes.h" namespace air { @@ -29,15 +29,15 @@ constexpr uint64_t kAscendDeviceMemSize = (kAscendInitDeviceMemGB << kMemSizeGB) uint64_t GetDeviceMemSize() { size_t free = 0; size_t total = 0; - rtError_t ret = rtMemGetInfoEx(RT_MEMORYINFO_HBM, &free, &total); - if (ret != RT_ERROR_NONE) { + aclError ret = aclrtGetMemInfo(ACL_HBM_MEM, &free, &total); + if (ret != ACL_SUCCESS) { LOG(FATAL) << "Get Device HBM memory size failed, ret = " << ret << ", total = " << total; } if (total != 0) { return total; } - ret = rtMemGetInfoEx(RT_MEMORYINFO_DDR, &free, &total); - if (ret != RT_ERROR_NONE) { + ret = aclrtGetMemInfo(ACL_DDR_MEM, &free, &total); + if (ret != ACL_SUCCESS) { LOG(FATAL) << "Get Device DDR memory size failed, ret = " << ret << ", total = " << total; } return total; @@ -54,10 +54,10 @@ uint64_t GetDefaultDeviceMemSize() { void AscendMemoryManager::MallocDeviceMemory() { device_mem_size_ = GetDefaultDeviceMemSize(); - rtError_t ret; + aclError ret; auto max_retry = 3; for (auto i = 0; i < max_retry; ++i) { - ret = rtMalloc(reinterpret_cast(&device_mem_base_), device_mem_size_, RT_MEMORY_HBM, 0); + ret = aclrtMalloc(reinterpret_cast(&device_mem_base_), device_mem_size_, ACL_MEM_MALLOC_HUGE_FIRST); if (ret == ACL_ERROR_RT_MEMORY_ALLOCATION) { LOG(WARNING) << "Device may be occupied, sleep 1s and retry again!"; device_mem_base_ = nullptr; @@ -67,20 +67,20 @@ void AscendMemoryManager::MallocDeviceMemory() { } } - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "rtMalloc mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "aclrtMalloc mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; } else { device_mem_offset_ = device_mem_size_; - LOG(INFO) << "Call rtMalloc to allocate device memory Success, size : " << device_mem_size_ + LOG(INFO) << "Call aclrtMalloc to allocate device memory Success, size : " << device_mem_size_ << " bytes , address : " << reinterpret_cast(device_mem_base_); } } void AscendMemoryManager::FreeDeviceMemory() { if (device_mem_base_ != nullptr) { - auto ret = rtFree(device_mem_base_); - if (ret != RT_ERROR_NONE) { - LOG(FATAL) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; + auto ret = aclrtFree(device_mem_base_); + if (ret != ACL_SUCCESS) { + LOG(FATAL) << "aclrtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]"; } device_mem_base_ = nullptr; } diff --git a/src/runtime/ascend/kernel.h b/src/runtime/ascend/kernel.h index 97cf7851..82485c28 100644 --- a/src/runtime/ascend/kernel.h +++ b/src/runtime/ascend/kernel.h @@ -74,7 +74,7 @@ class KernelPack { using KernelPackPtr = std::shared_ptr; -uintptr_t GetFuncStub(const KernelPack &kernel_pack, uint32_t *block_dim); +bool GetFuncStub(const KernelPack &kernel_pack, uint32_t *block_dim, std::string *func_name); KernelPackPtr GetKernelPack(const std::string &kernel_name); } // namespace runtime diff --git a/src/runtime/ascend/kernel_pack.cc b/src/runtime/ascend/kernel_pack.cc index 39185b4b..bb2655e3 100644 --- a/src/runtime/ascend/kernel_pack.cc +++ b/src/runtime/ascend/kernel_pack.cc @@ -15,13 +15,12 @@ */ #include #include -#include "runtime/rt.h" +#include #include "kernel.h" namespace air { namespace runtime { constexpr auto kJsonSuffix = ".json"; -static uintptr_t kernel_stub_gen_ = 0; inline size_t LongToSize(int64_t u) { if (u < 0) { @@ -120,49 +119,22 @@ bool KernelPack::LoadKernelMeta(const std::string &json_f) { KernelJsonInfo KernelPack::kernel_json_info() const { return kernel_json_info_; } -uintptr_t GetFuncStub(const KernelPack &kernel_pack, uint32_t *block_dim) { +bool GetFuncStub(const KernelPack &kernel_pack, uint32_t *block_dim, std::string *func_name) { auto kernel = kernel_pack.GetKernel(); if (kernel == nullptr) { LOG(FATAL) << "Invalid kernel pack, json or kernel is nullptr."; + return false; } auto kernel_contents = kernel->contents; if (kernel_contents == nullptr) { LOG(FATAL) << "Invalid kernel context, json or kernel is nullptr."; + return false; } auto kernel_json_info = kernel_pack.kernel_json_info(); *block_dim = kernel_json_info.block_dim; - std::string func_name = kernel_json_info.kernel_name; - std::string magic = kernel_json_info.magic; - - static std::map magic_maps = {{"RT_DEV_BINARY_MAGIC_PLAIN", RT_DEV_BINARY_MAGIC_PLAIN}, - {"RT_DEV_BINARY_MAGIC_PLAIN_AICPU", RT_DEV_BINARY_MAGIC_PLAIN_AICPU}, - {"RT_DEV_BINARY_MAGIC_PLAIN_AIVEC", RT_DEV_BINARY_MAGIC_PLAIN_AIVEC}, - {"RT_DEV_BINARY_MAGIC_ELF", RT_DEV_BINARY_MAGIC_ELF}, - {"RT_DEV_BINARY_MAGIC_ELF_AICPU", RT_DEV_BINARY_MAGIC_ELF_AICPU}, - {"RT_DEV_BINARY_MAGIC_ELF_AIVEC", RT_DEV_BINARY_MAGIC_ELF_AIVEC}, - {"RT_DEV_BINARY_MAGIC_ELF_AICUBE", RT_DEV_BINARY_MAGIC_ELF_AICUBE}}; - // object for device register. - auto iter = magic_maps.find(magic); - if (iter == magic_maps.end()) { - LOG(FATAL) << "Invalid magic number: " << magic << ", kernel: " << func_name; - } - - // BinaryRegister - void *module = nullptr; - rtDevBinary_t devBin; - devBin.magic = iter->second; - devBin.version = 0; - devBin.length = kernel->len; - devBin.data = kernel->contents; - static_cast(rtDevBinaryRegister(&devBin, &module)); - - // to diff different funcs. - uintptr_t func_stub = ++kernel_stub_gen_; - static_cast( - rtFunctionRegister(module, reinterpret_cast(func_stub), func_name.c_str(), func_name.c_str(), 0)); - - return func_stub; + *func_name = kernel_json_info.kernel_name; + return true; } KernelPackPtr GetKernelPack(const std::string &kernel_name) { diff --git a/src/runtime/ascend/runtime_error_codes.cc b/src/runtime/ascend/runtime_error_codes.cc index 31961eba..2a9c31f7 100644 --- a/src/runtime/ascend/runtime_error_codes.cc +++ b/src/runtime/ascend/runtime_error_codes.cc @@ -16,6 +16,7 @@ #include "runtime_error_codes.h" #include #include +#include "runtime/cce/cce_acl.h" const std::map error_msg = { {ACL_RT_SUCCESS, "success"}, diff --git a/src/runtime/ascend/runtime_error_codes.h b/src/runtime/ascend/runtime_error_codes.h index 0f5a23e9..c3c70901 100644 --- a/src/runtime/ascend/runtime_error_codes.h +++ b/src/runtime/ascend/runtime_error_codes.h @@ -18,7 +18,6 @@ #define SRC_RUNTIME_ASCEND_RUNTIME_ERROR_CODES_H_ #include -#include "external/runtime/rt_error_codes.h" namespace air { namespace runtime { std::string GetErrorMsg(uint32_t rt_error_code); diff --git a/src/runtime/stub/runtime_stub.cc b/src/runtime/stub/runtime_stub.cc deleted file mode 100644 index a9e7c3b1..00000000 --- a/src/runtime/stub/runtime_stub.cc +++ /dev/null @@ -1,126 +0,0 @@ -/** - * Copyright 2019-2023 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include - -#define EVENT_LENTH 10 - -#define FUNC_ENTRY LOG(INFO) << "Run in func " << __FUNCTION__; -#define RT_ERROR_MEMORY_ALLOCATION -1 - -void *ProfMgrStartUp(const ProfMgrCfg *cfg) { - return reinterpret_cast(0xffffff); -} - -int ProfMgrStop(void *handle) { - return 0; -} - -rtError_t rtEventCreate(rtEvent_t *event) { - *event = new (std::nothrow) int[EVENT_LENTH]; - if (*event == nullptr) { - return RT_ERROR_MEMORY_ALLOCATION; - } - return RT_ERROR_NONE; -} - -rtError_t rtMalloc(void **devPtr, uint64_t size, rtMemType_t type, const uint16_t moduleId) { - FUNC_ENTRY - CHECK_GT(size, 0); - *devPtr = new (std::nothrow) uint8_t[size]; - if (*devPtr == nullptr) { - return RT_ERROR_MEMORY_ALLOCATION; - } - return RT_ERROR_NONE; -} - -rtError_t rtFree(void *devPtr) { - FUNC_ENTRY - delete[] reinterpret_cast(devPtr); - return RT_ERROR_NONE; -} - -rtError_t rtStreamCreate(rtStream_t *stream, int32_t priority) { - *stream = new (std::nothrow) uint32_t; - if (*stream == nullptr) { - return RT_ERROR_MEMORY_ALLOCATION; - } - return RT_ERROR_NONE; -} - -rtError_t rtStreamDestroy(rtStream_t stream) { - delete reinterpret_cast(stream); - return RT_ERROR_NONE; -} - -rtError_t rtSetDevice(int32_t device) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtStreamSynchronize(rtStream_t stream) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtMemcpy(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtMemcpyAsync(void *dst, uint64_t destMax, const void *src, uint64_t count, rtMemcpyKind_t kind, - rtStream_t stream) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtStreamWaitEvent(rtStream_t stream, rtEvent_t event) { return RT_ERROR_NONE; } - -rtError_t rtGetDeviceCount(int32_t *count) { - *count = 1; - return RT_ERROR_NONE; -} - -rtError_t rtDevBinaryRegister(const rtDevBinary_t *bin, void **handle) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtDevBinaryUnRegister(void *handle) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, const char *stubName, const void *devFunc, - uint32_t funcMode) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *args, uint32_t argsSize, rtL2Ctrl_t *l2ctrl, - rtStream_t stream) { - FUNC_ENTRY - return RT_ERROR_NONE; -} - -rtError_t rtGetDevice(int32_t *device) { - FUNC_ENTRY - *device = 0; - return RT_ERROR_NONE; -} diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_acl.h b/third_party/incubator-tvm/src/runtime/cce/cce_acl.h new file mode 100644 index 00000000..91c282d8 --- /dev/null +++ b/third_party/incubator-tvm/src/runtime/cce/cce_acl.h @@ -0,0 +1,220 @@ +/*! + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file cce_acl.h + * \brief cce acl symbols + */ + +/*! + * 2024.1.24 - Add file cce_acl.h. + */ + +#ifndef TVM_RUNTIME_CCE_ACL_H_ +#define TVM_RUNTIME_CCE_ACL_H_ + +#include +#include + +#define RT_STREAM_DEFAULT (0x00U) +#define RT_STREAM_PERSISTENT (0x01U) +#define RT_STREAM_FORCE_COPY (0x02U) +#define RT_STREAM_HUGE (0x04U) +#define RT_STREAM_AICPU (0x08U) +#define RT_STREAM_FORBIDDEN_DEFAULT (0x10U) +#define RT_STREAM_HEAD (0x20U) +#define RT_STREAM_PRIMARY_DEFAULT (0x40U) +#define RT_STREAM_PRIMARY_FIRST_DEFAULT (0x80U) +#define RT_STREAM_OVERFLOW (0x100U) +#define RT_STREAM_FAST_LAUNCH (0x200U) +#define RT_STREAM_FAST_SYNC (0x400U) +#define RT_STREAM_CP_PROCESS_USE (0x800U) + + +static const int32_t ACL_RT_SUCCESS = 0; // success +static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid +static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id +static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null +static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context +static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context +static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model +static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid +static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal +static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned +static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed +static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed +static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream +static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread +static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set +static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create +static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream +static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type +static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle +static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type +static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout +static const int32_t ACL_ERROR_RT_TASK_TIMEOUT = 107020; // task timeout + +static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support +static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error +static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error +static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow +static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device +static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail +static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission +static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource +static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource +static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource +static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource +static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource +static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012; // over limit +static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013; // queue is empty +static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014; // queue is full +static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015; // repeated init +static const int32_t ACL_ERROR_RT_AIVEC_OVER_FLOW = 207016; // aivec over flow +static const int32_t ACL_ERROR_RT_OVER_FLOW = 207017; // common over flow +static const int32_t ACL_ERROR_RT_DEVIDE_OOM = 207018; // device oom +static const int32_t ACL_ERROR_RT_SEND_MSG = 207019; // hdc send msg fail +static const int32_t ACL_ERROR_RT_COPY_USER_FAIL = 207020; // copy data fail + +static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error +static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internel error +static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream +static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream +static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete +static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence +static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete +static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error +static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error +static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support +static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat +static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed +static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout +static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error +static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout +static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception +static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception +static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout +static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception +static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error +static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error +static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error +static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error +static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal +static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering +static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init +static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data +static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error +static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate +static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed +static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed +static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context +static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out +static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout +static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception +static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal +static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR = 507038; // can not change die mode +static const int32_t ACL_ERROR_RT_DIE_SET_ERROR = 507039; // single die mode can not set die +static const int32_t ACL_ERROR_RT_INVALID_DIEID = 507040; // invalid die id +static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET = 507041; // die mode not set +static const int32_t ACL_ERROR_RT_AICORE_TRAP_READ_OVERFLOW = 507042; // aic trap read overflow +static const int32_t ACL_ERROR_RT_AICORE_TRAP_WRITE_OVERFLOW = 507043; // aic trap write overflow +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_READ_OVERFLOW = 507044; // aiv trap read overflow +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_WRITE_OVERFLOW = 507045; // aiv trap write overflow +static const int32_t ACL_ERROR_RT_STREAM_SYNC_TIMEOUT = 507046; // stream sync time out +static const int32_t ACL_ERROR_RT_EVENT_SYNC_TIMEOUT = 507047; // event sync time out +static const int32_t ACL_ERROR_RT_FFTS_PLUS_TIMEOUT = 507048; // ffts+ timeout +static const int32_t ACL_ERROR_RT_FFTS_PLUS_EXCEPTION = 507049; // ffts+ exception +static const int32_t ACL_ERROR_RT_FFTS_PLUS_TRAP_EXCEPTION = 507050; // ffts+ trap exception + +static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error +static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error +static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect + +typedef void *aclrtStream; +typedef void *aclrtContext; +typedef int aclError; +typedef int rtError_t; + +typedef enum aclrtMemcpyKind { + ACL_MEMCPY_HOST_TO_HOST, + ACL_MEMCPY_HOST_TO_DEVICE, + ACL_MEMCPY_DEVICE_TO_HOST, + ACL_MEMCPY_DEVICE_TO_DEVICE, +} aclrtMemcpyKind; + +typedef enum aclrtMemMallocPolicy { + ACL_MEM_MALLOC_HUGE_FIRST, + ACL_MEM_MALLOC_HUGE_ONLY, + ACL_MEM_MALLOC_NORMAL_ONLY, + ACL_MEM_MALLOC_HUGE_FIRST_P2P, + ACL_MEM_MALLOC_HUGE_ONLY_P2P, + ACL_MEM_MALLOC_NORMAL_ONLY_P2P, + ACL_MEM_TYPE_LOW_BAND_WIDTH = 0x0100, + ACL_MEM_TYPE_HIGH_BAND_WIDTH = 0x1000, +} aclrtMemMallocPolicy; + + +typedef enum aclrtMemAttr { + ACL_DDR_MEM, + ACL_HBM_MEM, + ACL_DDR_MEM_HUGE, + ACL_DDR_MEM_NORMAL, + ACL_HBM_MEM_HUGE, + ACL_HBM_MEM_NORMAL, + ACL_DDR_MEM_P2P_HUGE, + ACL_DDR_MEM_P2P_NORMAL, + ACL_HBM_MEM_P2P_HUGE, + ACL_HBM_MEM_P2P_NORMAL, +} aclrtMemAttr; + + +typedef struct tagRtDevBinary { + uint32_t magic; // magic number + uint32_t version; // version of binary + const void *data; // binary data + uint64_t length; // binary length +} rtDevBinary_t; + + +static const int ACL_SUCCESS = 0; + +aclError aclrtSetCurrentContext(aclrtContext context); +aclError aclrtGetDeviceCount(uint32_t *count); +aclError aclrtGetCurrentContext(aclrtContext *context); +aclError aclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag); +aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src, size_t count, + aclrtMemcpyKind kind, aclrtStream stream); +aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total); +aclError aclrtSetDevice(int32_t deviceId); +aclError aclrtCreateContext(aclrtContext *context, int32_t deviceId); +aclError aclrtCreateStream(aclrtStream *stream); +aclError aclrtMallocHost(void **hostPtr, size_t size); +aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy); +aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind); +aclError aclrtSynchronizeStream(aclrtStream stream); +aclError aclrtFree(void *devPtr); +aclError aclrtFreeHost(void *hostPtr); +aclError aclrtDestroyStream(aclrtStream stream); +aclError aclrtDestroyContext(aclrtContext context); +aclError aclrtResetDevice(int32_t deviceId); +aclError aclrtGetDevice(int32_t *deviceId); + +#endif // TVM_RUNTIME_CCE_ACL_H_ diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_common.h b/third_party/incubator-tvm/src/runtime/cce/cce_common.h index 7ce1c0b7..1a005be2 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_common.h +++ b/third_party/incubator-tvm/src/runtime/cce/cce_common.h @@ -25,22 +25,23 @@ /*! * 2019.12.30 - Add file cce_common.h. * 2023.4.21 - Include cce_wrapper.h. + * 2024.1.24 - Change rt*** to aclrt***. */ #ifndef TVM_RUNTIME_CCE_CCE_COMMON_H_ #define TVM_RUNTIME_CCE_CCE_COMMON_H_ #include -#include #include #include #include "cce_wrapper.h" +#include "runtime/cce/cce_acl.h" namespace air { namespace runtime { -inline const char* CceGetErrorString(rtError_t e) { +inline const char* CceGetErrorString(aclError e) { switch (e) { - case RT_ERROR_NONE: + case ACL_SUCCESS: return "success"; default: return "Unknow cce error code"; @@ -49,15 +50,15 @@ inline const char* CceGetErrorString(rtError_t e) { #define CCE_CALL(func) \ { \ - rtError_t e = (func); \ - CHECK(e == RT_ERROR_NONE) << "Cce runtime error: errno=" << e << ", info=" << CceGetErrorString(e); \ + aclError e = (func); \ + CHECK(e == ACL_SUCCESS) << "Cce runtime error: errno=" << e << ", info=" << CceGetErrorString(e); \ } /*! \brief Thread local workspace */ class CceThreadEntry { public: /*! \brief The cce stream */ - rtStream_t stream{nullptr}; + aclrtStream stream{nullptr}; /*! \brief thread local pool */ air::runtime::WorkspacePool pool; /*! \brief profiting handle */ @@ -72,33 +73,6 @@ class CceThreadEntry { static CceThreadEntry* ThreadLocal(); }; -#ifndef RT_KERNEL_LAUNCH_PARAMETRIC_SHAPE -#define RT_KERNEL_LAUNCH_PARAMETRIC_SHAPE -/** - * @ingroup kt_kernel - * @brief launch kernel to device - * @param [in] stubFunc stub function - * @param [in] blockDim block dimentions - * @param [in] args tensor arguments 64-bit address for kernel function - * @param [in] argsSize tensor arguments size - * @param [in] shapes 64-bit integer arguments for kernel function - * @param [in] shapeSize integer arguments size - * @param [in] smDesc shared memory description - * @param [in] stream associated stream - * @return RT_ERROR_NONE for ok, errno for failed - */ -#ifdef USE_KC_AIR -extern "C" rtError_t rtKernelLaunchShapes(const void *stubFunc, - uint32_t blockDIm, - void *args, - uint32_t argsSize, - int64_t *shapes, - uint32_t shapeSize, - rtSmDesc_t *smDesc, - rtStream_t stream); -#endif // USE_KC_AIR -#endif // RT_KERNEL_LAUNCH_PARAMETRIC_SHAPE - } // namespace runtime } // namespace air diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_device_api.cc b/third_party/incubator-tvm/src/runtime/cce/cce_device_api.cc index 4b13c908..40c048af 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_device_api.cc +++ b/third_party/incubator-tvm/src/runtime/cce/cce_device_api.cc @@ -24,21 +24,22 @@ /*! * 2019.12.30 - Add file cce_device_api.cc. + * 2024.1.24 - Change rt*** to aclrt***. */ #include #include #include -#include #include #include "runtime/cce/cce_common.h" +#include "runtime/cce/cce_acl.h" namespace air { namespace runtime { class CceDeviceAPI final : public DeviceAPI { public: - void SetDevice(TVMContext ctx) final { CCE_CALL(rtSetDevice(ctx.device_id)); } + void SetDevice(TVMContext ctx) final { CCE_CALL(aclrtSetDevice(ctx.device_id)); } void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { switch (kind) { @@ -80,53 +81,53 @@ class CceDeviceAPI final : public DeviceAPI { void* ptr = nullptr; // alignment check here - CCE_CALL(rtSetDevice(ctx.device_id)); - CCE_CALL(rtMalloc(&ptr, size + 32, RT_MEMORY_HBM, 0)); + CCE_CALL(aclrtSetDevice(ctx.device_id)); + CCE_CALL(aclrtMalloc(&ptr, size + 32, ACL_MEM_MALLOC_HUGE_FIRST)); return ptr; } void FreeDataSpace(TVMContext ctx, void* ptr) final { if (ptr != nullptr) { - CCE_CALL(rtSetDevice(ctx.device_id)); - CCE_CALL(rtFree(ptr)); + CCE_CALL(aclrtSetDevice(ctx.device_id)); + CCE_CALL(aclrtFree(ptr)); } } void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { LOG(INFO) << " from " << from << " to " << to << " ctx_from " << ctx_from; - auto cce_stream = static_cast(stream); + auto cce_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; if (ctx_from.device_type == kDLCce && ctx_to.device_type == kDLCce) { - CCE_CALL(rtSetDevice(ctx_from.device_id)); + CCE_CALL(aclrtSetDevice(ctx_from.device_id)); if (ctx_from.device_id == ctx_to.device_id) { - CceCopy(from, to, num_bytes, RT_MEMCPY_DEVICE_TO_DEVICE, cce_stream); + CceCopy(from, to, num_bytes, ACL_MEMCPY_DEVICE_TO_DEVICE, cce_stream); } else { LOG(FATAL) << "expect the same device id copy between Cce"; } } else if (ctx_from.device_type == kDLCce && ctx_to.device_type == kDLCPU) { - CCE_CALL(rtSetDevice(ctx_from.device_id)); - CceCopy(from, to, num_bytes, RT_MEMCPY_DEVICE_TO_HOST, cce_stream); + CCE_CALL(aclrtSetDevice(ctx_from.device_id)); + CceCopy(from, to, num_bytes, ACL_MEMCPY_DEVICE_TO_HOST, cce_stream); } else if (ctx_from.device_type == kDLCPU && ctx_to.device_type == kDLCce) { - CCE_CALL(rtSetDevice(ctx_to.device_id)); - CceCopy(from, to, num_bytes, RT_MEMCPY_HOST_TO_DEVICE, cce_stream); + CCE_CALL(aclrtSetDevice(ctx_to.device_id)); + CceCopy(from, to, num_bytes, ACL_MEMCPY_HOST_TO_DEVICE, cce_stream); } else { LOG(FATAL) << "expect copy from/to Cce or between Cce"; } } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { - auto cce_stream = static_cast(stream); + auto cce_stream = static_cast(stream); - CCE_CALL(rtSetDevice(ctx.device_id)); - CCE_CALL(rtStreamSynchronize(cce_stream)); + CCE_CALL(aclrtSetDevice(ctx.device_id)); + CCE_CALL(aclrtSynchronizeStream(cce_stream)); } void SetStream(TVMContext ctx, TVMStreamHandle stream) final { - CceThreadEntry::ThreadLocal()->stream = static_cast(stream); + CceThreadEntry::ThreadLocal()->stream = static_cast(stream); } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint = {}) final { @@ -141,34 +142,34 @@ class CceDeviceAPI final : public DeviceAPI { } private: - static void CceCopy(const void* from, void* to, size_t num_bytes, rtMemcpyKind_t kind, rtStream_t stream) { - if (stream != RT_STREAM_DEFAULT) { + static void CceCopy(const void* from, void* to, size_t num_bytes, aclrtMemcpyKind kind, aclrtStream stream) { + if (stream != nullptr) { #ifdef USE_CCE_RT - CCE_CALL(rtMemcpyAsync(to, num_bytes + 1, const_cast(from), num_bytes, kind, stream)); + CCE_CALL(aclrtMemcpyAsync(to, num_bytes + 1, const_cast(from), num_bytes, kind, stream)); #else - CCE_CALL(rtMemcpyAsync(to, const_cast(from), num_bytes, kind, stream)); + CCE_CALL(aclrtMemcpyAsync(to, const_cast(from), num_bytes, kind, stream)); #endif } else { #ifdef USE_CCE_RT #ifdef USE_KC_AIR - CCE_CALL(rtMemcpy(to, num_bytes, from, num_bytes, kind)); + CCE_CALL(aclrtMemcpy(to, num_bytes, from, num_bytes, kind)); #else // because cce runtime cannot support large size memcpy when des/src is alloc by general // malloc(not page-pinned), so we copy with large size by small blocks. size_t block_size = 1024 * 1024 * 1024; for (size_t i = 0; i < num_bytes / block_size; i++) { - CCE_CALL(rtMemcpy(to, block_size, const_cast(from), block_size, kind)); + CCE_CALL(aclrtMemcpy(to, block_size, const_cast(from), block_size, kind)); from = reinterpret_cast(const_cast(from)) + block_size; to = reinterpret_cast(to) + block_size; } size_t remain = num_bytes % block_size; if (remain > 0) { - CCE_CALL(rtMemcpy(to, remain, const_cast(from), remain, kind)); + CCE_CALL(aclrtMemcpy(to, remain, const_cast(from), remain, kind)); } #endif #else - CCE_CALL(rtMemcpy(to, const_cast(from), num_bytes, kind)); + CCE_CALL(aclrtMemcpy(to, const_cast(from), num_bytes, kind)); #endif } } diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_module.cc b/third_party/incubator-tvm/src/runtime/cce/cce_module.cc index a39099d9..98713e81 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_module.cc +++ b/third_party/incubator-tvm/src/runtime/cce/cce_module.cc @@ -24,21 +24,23 @@ /*! * 2019.12.30 - Add file cce_module.cc. * 2023.4.21 - load cce symbols. + * 2024.1.24 - Change rt*** to aclrt***. */ #include "runtime/cce/cce_module.h" #include #include -#include #include #include #include #include "runtime/cce/cce_common.h" +#include "runtime/cce/cce_acl.h" #include "codegen/util.h" #include +#include #ifdef USE_CCE_PROFILING #include "profile_mgr.h" @@ -63,15 +65,7 @@ class CceModuleNode : public air::runtime::ModuleNode { } // destructor ~CceModuleNode() override { - for (int i = 0; i < static_cast(module_.size()); ++i) { - if (module_[i] != nullptr) { - try { - CCE_CALL(rtSetDevice(i)); - static_cast(rtDevBinaryUnRegister(module_[i])); - } catch (...) { - } - } - } + UnLoadKernelFunc(); } const char* type_key() const final { return "cce"; } @@ -110,32 +104,31 @@ class CceModuleNode : public air::runtime::ModuleNode { } } - // get a funcStub from primary context in device_id - void* GetFuncStub(int device_id, const std::string& func_name) { - std::lock_guard lock(mutex_); - // must recheck under the lock scope - if (module_[device_id] == nullptr) { - rtDevBinary_t devBin; - devBin.magic = RT_DEV_BINARY_MAGIC_ELF; - devBin.version = 1; - devBin.length = data_.size(); - devBin.data = data_.c_str(); - static_cast(rtDevBinaryRegister(&devBin, &module_[device_id])); - } + void *GetKernelFunc(const std::string &func_name) { + const auto *f = Registry::Get("get_kernel_meta_path"); + CHECK(f != nullptr) << "Function get_kernel_meta_path is not registered"; + std::string file_str = (*f)().operator std::string(); + (void)file_str.append(func_name).append(".o"); + char *file_c_str = (char *)file_str.c_str(); + + void *handle = dlopen(file_c_str, RTLD_LAZY | RTLD_LOCAL); + CHECK(handle != nullptr) << "dlopen failed, file: " << file_c_str; + + std::string func_str = func_name + "_do"; + char *func_c_str = (char *)func_str.c_str(); + void *func = dlsym(handle, func_c_str); + CHECK(func != nullptr) << "dlsym failed, symbol: " << func_str; + return func; + } - void* func_stub = nullptr; - auto search = stub_[device_id].find(func_name); - if (search != stub_[device_id].end()) { - func_stub = search->second; - } else { - kernel_stub_gen_++; - func_stub = kernel_stub_gen_; - static_cast(rtFunctionRegister(module_[device_id], func_stub, func_name.c_str(), - func_name.c_str(), 0)); - stub_[device_id][func_name] = func_stub; + bool UnLoadKernelFunc() { + if (cce_handle_ != nullptr) { + if (dlclose(cce_handle_) != 0) { + return false; + } } - - return func_stub; + cce_handle_ = nullptr; + return true; } private: @@ -154,6 +147,7 @@ class CceModuleNode : public air::runtime::ModuleNode { std::mutex mutex_; // global increate to make stub unique static int* kernel_stub_gen_; + void *cce_handle_{nullptr}; }; int* CceModuleNode::kernel_stub_gen_ = nullptr; @@ -177,25 +171,15 @@ class CceWrappedFunc { // invoke the function with void arguments void operator()(const TVMArgs args, TVMRetValue* rv, void** void_args, int64_t* shape_args, size_t shape_arg_size) const { - int device_id; - CCE_CALL(rtGetDevice(&device_id)); - - if (fcache_[device_id] == nullptr) { - fcache_[device_id] = m_->GetFuncStub(device_id, func_name_); - } + int32_t device_id; + CCE_CALL(aclrtGetDevice(&device_id)); ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); int blockDim = static_cast(wl.grid_dim(0)); - rtL2Ctrl_t* l2ctrl = nullptr; - auto strm = static_cast(CceThreadEntry::ThreadLocal()->stream); + auto strm = static_cast(CceThreadEntry::ThreadLocal()->stream); size_t raw_size = arg_size_.size() - shape_arg_size; - void** raw_args; -#ifdef USE_KC_AIR - raw_args = new void*[raw_size]; -#else - raw_args = new void*[arg_size_.size()]; -#endif + void** raw_args = new void*[arg_size_.size()]; size_t args_size = 0; for (size_t i = 0; i < raw_size; ++i) { args_size += arg_size_[i]; @@ -203,30 +187,24 @@ class CceWrappedFunc { raw_args[i] = *ptr; } - rtError_t result; - + aclError result; + typedef void (*CallFunc)(uint32_t, void*, void*, void**); + auto func_ptr = reinterpret_cast(m_->GetKernelFunc(func_name_)); if (shape_arg_size == 0) { - result = rtKernelLaunch(fcache_[device_id], blockDim, - raw_args, // void_args, - static_cast(args_size), l2ctrl, strm); + func_ptr(blockDim, nullptr, strm, raw_args); } else { - result = RT_ERROR_NONE; + result = ACL_SUCCESS; if (blockDim == INT_MAX) { blockDim = shape_args[shape_arg_size - 1]; } -#ifdef USE_KC_AIR - result = rtKernelLaunchShapes(fcache_[device_id], blockDim, - raw_args, // void_args, - static_cast(args_size), shape_args, shape_arg_size, - l2ctrl, strm); -#else + for (size_t ssize = raw_size; ssize < arg_size_.size(); ++ssize) { void* tempshape = reinterpret_cast (shape_args[ssize - raw_size]); raw_args[ssize] = tempshape; args_size += 8; } - result = rtKernelLaunch(fcache_[device_id], blockDim, raw_args, static_cast(args_size), l2ctrl, strm); -#endif + + func_ptr(blockDim, nullptr, strm, raw_args); akg::RecordCore(blockDim, true); } @@ -234,7 +212,7 @@ class CceWrappedFunc { uint32_t stream_id; uint32_t task_id; auto rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id); - if (rt_ret != RT_ERROR_NONE) { + if (rt_ret != ACL_SUCCESS) { LOG(FATAL) << "Profiling get task_id stream_id failed"; } auto label = std::to_string(stream_id) + "_" + std::to_string(task_id); @@ -242,7 +220,7 @@ class CceWrappedFunc { #endif delete[] raw_args; - if (result != RT_ERROR_NONE) { + if (result != ACL_SUCCESS) { const char* msg{nullptr}; std::ostringstream os; diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_module.h b/third_party/incubator-tvm/src/runtime/cce/cce_module.h index 28a4bab8..69e07afe 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_module.h +++ b/third_party/incubator-tvm/src/runtime/cce/cce_module.h @@ -24,6 +24,7 @@ /*! * 2019.12.30 - Add file cce_module.h. + * 2024.1.24 - Change rt*** to aclrt***. */ #ifndef TVM_RUNTIME_CCE_CCE_MODULE_H_ diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.cc b/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.cc index b0f5c468..0b5e8e9e 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.cc +++ b/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.cc @@ -24,6 +24,7 @@ /*! * 2023.4.21 - Add file cce_wrapper.cc. + * 2024.1.24 - Change rt*** to aclrt***. */ #include "cce_wrapper.h" @@ -51,48 +52,72 @@ CceWrapper::~CceWrapper() { } bool CceWrapper::UnLoadLibraries() { - if (cce_handle_ != nullptr) { - if (dlclose(cce_handle_) != 0) { + if (ascendcl_handle_ != nullptr) { + if (dlclose(ascendcl_handle_) != 0) { return false; } } - cce_handle_ = nullptr; + ascendcl_handle_ = nullptr; + + if (runtime_handle_ != nullptr) { + if (dlclose(runtime_handle_) != 0) { + return false; + } + } + runtime_handle_ = nullptr; return true; } bool CceWrapper::LoadLibraries() { + LoadAscendCL(); + LoadRuntime(); + return true; +} + +bool CceWrapper::LoadAscendCL() { + std::string library_path = "libascendcl.so"; + void *handle_ptr = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (handle_ptr == nullptr) { + LOG(ERROR) << "load library " << library_path << " failed!"; + return false; + } + ascendcl_handle_ = handle_ptr; + + // aclrt + LOAD_FUNCTION_PTR(aclrtSetDevice); + LOAD_FUNCTION_PTR(aclrtCreateContext); + LOAD_FUNCTION_PTR(aclrtCreateStream); + LOAD_FUNCTION_PTR(aclrtMallocHost); + LOAD_FUNCTION_PTR(aclrtMalloc); + LOAD_FUNCTION_PTR(aclrtMemcpy); + LOAD_FUNCTION_PTR(aclrtSynchronizeStream); + LOAD_FUNCTION_PTR(aclrtFree); + LOAD_FUNCTION_PTR(aclrtFreeHost); + LOAD_FUNCTION_PTR(aclrtDestroyStream); + LOAD_FUNCTION_PTR(aclrtDestroyContext); + LOAD_FUNCTION_PTR(aclrtResetDevice); + LOAD_FUNCTION_PTR(aclrtSetCurrentContext); + LOAD_FUNCTION_PTR(aclrtGetDeviceCount); + LOAD_FUNCTION_PTR(aclrtGetCurrentContext); + LOAD_FUNCTION_PTR(aclrtCreateStreamWithConfig); + LOAD_FUNCTION_PTR(aclrtMemcpyAsync); + LOAD_FUNCTION_PTR(aclrtGetMemInfo); + LOAD_FUNCTION_PTR(aclrtGetDevice); + + return true; +} + +bool CceWrapper::LoadRuntime() { std::string library_path = "libruntime.so"; void *handle_ptr = dlopen(library_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (handle_ptr == nullptr) { LOG(ERROR) << "load library " << library_path << " failed!"; return false; } - cce_handle_ = handle_ptr; - - LOAD_FUNCTION_PTR(rtGetDevice); - LOAD_FUNCTION_PTR(rtGetDeviceCount); - LOAD_FUNCTION_PTR(rtSetDevice); - LOAD_FUNCTION_PTR(rtDeviceReset); - LOAD_FUNCTION_PTR(rtCtxCreate); - LOAD_FUNCTION_PTR(rtCtxDestroy); - LOAD_FUNCTION_PTR(rtCtxGetCurrent); - LOAD_FUNCTION_PTR(rtCtxSetCurrent); - LOAD_FUNCTION_PTR(rtCtxSynchronize); - LOAD_FUNCTION_PTR(rtMemGetInfoEx); - LOAD_FUNCTION_PTR(rtEventCreate); - LOAD_FUNCTION_PTR(rtStreamCreate); - LOAD_FUNCTION_PTR(rtStreamCreateWithFlags); - LOAD_FUNCTION_PTR(rtStreamDestroy); - LOAD_FUNCTION_PTR(rtStreamSynchronize); - LOAD_FUNCTION_PTR(rtStreamWaitEvent); - LOAD_FUNCTION_PTR(rtMalloc); - LOAD_FUNCTION_PTR(rtFree); - LOAD_FUNCTION_PTR(rtMemcpy); - LOAD_FUNCTION_PTR(rtMemcpyAsync); - LOAD_FUNCTION_PTR(rtDevBinaryRegister); - LOAD_FUNCTION_PTR(rtDevBinaryUnRegister); - LOAD_FUNCTION_PTR(rtFunctionRegister); - LOAD_FUNCTION_PTR(rtKernelLaunch); + runtime_handle_ = handle_ptr; + + // rt + LOAD_FUNCTION_PTR(rtGetTaskIdAndStreamID); return true; } @@ -100,150 +125,124 @@ bool CceWrapper::LoadLibraries() { } // namespace runtime } // namespace air - -rtError_t rtGetDevice(int32_t *device) { - auto func = air::runtime::CceWrapper::GetInstance()->rtGetDevice; +aclError aclrtSetCurrentContext(aclrtContext context) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtSetCurrentContext; CHECK_NOTNULL(func); - return func(device); + return func(context); } -rtError_t rtGetDeviceCount(int32_t *count) { - auto func = air::runtime::CceWrapper::GetInstance()->rtGetDeviceCount; +aclError aclrtGetDeviceCount(uint32_t *count) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtGetDeviceCount; CHECK_NOTNULL(func); return func(count); } -rtError_t rtSetDevice(int32_t device) { - auto func = air::runtime::CceWrapper::GetInstance()->rtSetDevice; - CHECK_NOTNULL(func); - return func(device); -} - -rtError_t rtDeviceReset(int32_t device) { - auto func = air::runtime::CceWrapper::GetInstance()->rtDeviceReset; +aclError aclrtGetCurrentContext(aclrtContext *context) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtGetCurrentContext; CHECK_NOTNULL(func); - return func(device); + return func(context); } -rtError_t rtCtxCreate(rtContext_t *ctx, uint32_t flags, int32_t dev) { - auto func = air::runtime::CceWrapper::GetInstance()->rtCtxCreate; +aclError aclrtCreateStreamWithConfig(aclrtStream *stream, uint32_t priority, uint32_t flag) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtCreateStreamWithConfig; CHECK_NOTNULL(func); - return func(ctx, flags, dev); + return func(stream, priority, flag); } -rtError_t rtCtxDestroy(rtContext_t ctx) { - auto func = air::runtime::CceWrapper::GetInstance()->rtCtxDestroy; +aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src, size_t count, + aclrtMemcpyKind kind, aclrtStream stream) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtMemcpyAsync; CHECK_NOTNULL(func); - return func(ctx); + return func(dst, destMax, src, count, kind, stream); } -rtError_t rtCtxGetCurrent(rtContext_t *ctx) { - auto func = air::runtime::CceWrapper::GetInstance()->rtCtxGetCurrent; +aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtGetMemInfo; CHECK_NOTNULL(func); - return func(ctx); + return func(attr, free, total); } -rtError_t rtCtxSetCurrent(rtContext_t ctx) { - auto func = air::runtime::CceWrapper::GetInstance()->rtCtxSetCurrent; +aclError aclrtSetDevice(int32_t deviceId) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtSetDevice; CHECK_NOTNULL(func); - return func(ctx); + return func(deviceId); } -rtError_t rtCtxSynchronize(void) { - auto func = air::runtime::CceWrapper::GetInstance()->rtCtxSynchronize; +aclError aclrtCreateContext(aclrtContext *context, int32_t deviceId) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtCreateContext; CHECK_NOTNULL(func); - return func(); + return func(context, deviceId); } -rtError_t rtMemGetInfoEx(rtMemInfoType_t info_type, size_t *free_size, size_t *total_size) { - auto func = air::runtime::CceWrapper::GetInstance()->rtMemGetInfoEx; - CHECK_NOTNULL(func); - return func(info_type, free_size, total_size); -} -rtError_t rtEventCreate(rtEvent_t *event) { - auto func = air::runtime::CceWrapper::GetInstance()->rtEventCreate; +aclError aclrtCreateStream(aclrtStream *stream) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtCreateStream; CHECK_NOTNULL(func); - return func(event); + return func(stream); } -rtError_t rtStreamCreate(rtStream_t *stream, int32_t priority) { - auto func = air::runtime::CceWrapper::GetInstance()->rtStreamCreate; +aclError aclrtMallocHost(void **hostPtr, size_t size) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtMallocHost; CHECK_NOTNULL(func); - return func(stream, priority); + return func(hostPtr, size); } -rtError_t rtStreamCreateWithFlags(rtStream_t *stm, int32_t priority, uint32_t flags) { - auto func = air::runtime::CceWrapper::GetInstance()->rtStreamCreateWithFlags; +aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtMalloc; CHECK_NOTNULL(func); - return func(stm, priority, flags); + return func(devPtr, size, policy); } -rtError_t rtStreamDestroy(rtStream_t stream) { - auto func = air::runtime::CceWrapper::GetInstance()->rtStreamDestroy; +aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count, aclrtMemcpyKind kind) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtMemcpy; CHECK_NOTNULL(func); - return func(stream); + return func(dst, destMax, src, count, kind); } -rtError_t rtStreamSynchronize(rtStream_t stream) { - auto func = air::runtime::CceWrapper::GetInstance()->rtStreamSynchronize; +aclError aclrtSynchronizeStream(aclrtStream stream) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtSynchronizeStream; CHECK_NOTNULL(func); return func(stream); } -rtError_t rtStreamWaitEvent(rtStream_t stream, rtEvent_t event) { - auto func = air::runtime::CceWrapper::GetInstance()->rtStreamWaitEvent; +aclError aclrtFree(void *devPtr) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtFree; CHECK_NOTNULL(func); - return func(stream, event); + return func(devPtr); } -rtError_t rtMalloc(void **dev_ptr, uint64_t size, rtMemType_t type, uint16_t moduleId) { - auto func = air::runtime::CceWrapper::GetInstance()->rtMalloc; +aclError aclrtFreeHost(void *hostPtr) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtFreeHost; CHECK_NOTNULL(func); - return func(dev_ptr, size, type, moduleId); + return func(hostPtr); } -rtError_t rtFree(void *dev_ptr) { - auto func = air::runtime::CceWrapper::GetInstance()->rtFree; +aclError aclrtDestroyStream(aclrtStream stream) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtDestroyStream; CHECK_NOTNULL(func); - return func(dev_ptr); -} - -rtError_t rtMemcpy(void *dst, uint64_t dest_max, const void *src, uint64_t count, rtMemcpyKind_t kind) { - auto func = air::runtime::CceWrapper::GetInstance()->rtMemcpy; - CHECK_NOTNULL(func); - return func(dst, dest_max, src, count, kind); -} - -rtError_t rtMemcpyAsync(void *dst, uint64_t dest_max, const void *src, uint64_t count, - rtMemcpyKind_t kind, rtStream_t stream) { - auto func = air::runtime::CceWrapper::GetInstance()->rtMemcpyAsync; - CHECK_NOTNULL(func); - return func(dst, dest_max, src, count, kind, stream); + return func(stream); } -rtError_t rtDevBinaryRegister(const rtDevBinary_t *bin, void **handle) { - auto func = air::runtime::CceWrapper::GetInstance()->rtDevBinaryRegister; +aclError aclrtDestroyContext(aclrtContext context) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtDestroyContext; CHECK_NOTNULL(func); - return func(bin, handle); + return func(context); } -rtError_t rtDevBinaryUnRegister(void *handle) { - auto func = air::runtime::CceWrapper::GetInstance()->rtDevBinaryUnRegister; +aclError aclrtResetDevice(int32_t deviceId) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtResetDevice; CHECK_NOTNULL(func); - return func(handle); + return func(deviceId); } -rtError_t rtFunctionRegister(void *handle, const void *stub_func, const char *stub_name, - const void *dev_func, uint32_t func_mode) { - auto func = air::runtime::CceWrapper::GetInstance()->rtFunctionRegister; +aclError aclrtGetDevice(int32_t *deviceId) { + auto func = air::runtime::CceWrapper::GetInstance()->aclrtGetDevice; CHECK_NOTNULL(func); - return func(handle, stub_func, stub_name, dev_func, func_mode); + return func(deviceId); } -rtError_t rtKernelLaunch(const void *stub_func, uint32_t block_dim, void *args, uint32_t args_size, - rtL2Ctrl_t *l2ctrl, rtStream_t stream) { - auto func = air::runtime::CceWrapper::GetInstance()->rtKernelLaunch; +rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId) { + auto func = air::runtime::CceWrapper::GetInstance()->rtGetTaskIdAndStreamID; CHECK_NOTNULL(func); - return func(stub_func, block_dim, args, args_size, l2ctrl, stream); -} + return func(taskId, streamId); +} \ No newline at end of file diff --git a/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.h b/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.h index b0a9f7a0..63a0266d 100644 --- a/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.h +++ b/third_party/incubator-tvm/src/runtime/cce/cce_wrapper.h @@ -24,13 +24,14 @@ /*! * 2023.4.21 - Add file cce_wrapper.h. + * 2024.1.24 - Change rt*** to aclrt***. */ #ifndef TVM_RUNTIME_CCE_CCE_WRAPPER_H_ #define TVM_RUNTIME_CCE_CCE_WRAPPER_H_ #include "../symbols_wrapper.h" -#include +#include "runtime/cce/cce_acl.h" namespace air { namespace runtime { @@ -46,60 +47,62 @@ class CceWrapper : public SymbolsWrapper { bool UnLoadLibraries(); private: CceWrapper(); + bool LoadAscendCL(); + bool LoadRuntime(); static std::shared_ptr cce_wrapper_singleton_; - void *cce_handle_{nullptr}; + void *ascendcl_handle_{nullptr}; + void *runtime_handle_{nullptr}; public: - using rtGetDeviceFunc = rtError_t (*)(int32_t *); - using rtGetDeviceCountFunc = rtError_t (*)(int32_t *); - using rtSetDeviceFunc = rtError_t (*)(int32_t ); - using rtDeviceResetFunc = rtError_t (*)(int32_t); - using rtCtxCreateFunc = rtError_t (*)(rtContext_t *, uint32_t, int32_t); - using rtCtxDestroyFunc = rtError_t (*)(rtContext_t); - using rtCtxGetCurrentFunc = rtError_t (*)(rtContext_t *); - using rtCtxSetCurrentFunc = rtError_t (*)(rtContext_t); - using rtCtxSynchronizeFunc = rtError_t (*)(void); - using rtMemGetInfoExFunc = rtError_t (*)(rtMemInfoType_t, size_t *, size_t *); - using rtEventCreateFunc = rtError_t (*)(rtEvent_t *); - using rtStreamCreateFunc = rtError_t (*)(rtStream_t *, int32_t); - using rtStreamCreateWithFlagsFunc = rtError_t (*)(rtStream_t *, int32_t, uint32_t); - using rtStreamDestroyFunc = rtError_t (*)(rtStream_t); - using rtStreamSynchronizeFunc = rtError_t (*)(rtStream_t); - using rtStreamWaitEventFunc = rtError_t (*)(rtStream_t, rtEvent_t); - using rtMallocFunc = rtError_t (*)(void **, uint64_t, rtMemType_t, uint16_t); - using rtFreeFunc = rtError_t (*)(void *); - using rtMemcpyFunc = rtError_t (*)(void *, uint64_t, const void *, uint64_t, rtMemcpyKind_t); - using rtMemcpyAsyncFunc = rtError_t (*)(void *, uint64_t, const void *, uint64_t, rtMemcpyKind_t, rtStream_t); + using aclrtSetCurrentContextFunc = aclError (*)(aclrtContext); + using aclrtGetDeviceCountFunc = aclError (*)(uint32_t *); + using aclrtGetCurrentContextFunc = aclError (*)(aclrtContext *); + using aclrtCreateStreamWithConfigFunc = aclError (*)(aclrtStream *, uint32_t, uint32_t); + using aclrtMemcpyAsyncFunc = aclError (*)(void *, size_t, const void *, size_t, aclrtMemcpyKind, aclrtStream); + using aclrtGetMemInfoFunc = aclError (*)(aclrtMemAttr, size_t *, size_t *); + using aclrtSetDeviceFunc = aclError (*)(int32_t); + using aclrtCreateContextFunc = aclError (*)(aclrtContext *, int32_t); + using aclrtCreateStreamFunc = aclError (*)(aclrtStream *); + using aclrtMallocHostFunc = aclError (*)(void **, size_t); + using aclrtMallocFunc = aclError (*)(void **, size_t, aclrtMemMallocPolicy); + using aclrtMemcpyFunc = aclError (*)(void *, size_t, const void *, size_t, aclrtMemcpyKind); + using aclrtSynchronizeStreamFunc = aclError (*)(aclrtStream); + using aclrtFreeFunc = aclError (*)(void *); + using aclrtFreeHostFunc = aclError (*)(void *); + using aclrtDestroyStreamFunc = aclError (*)(aclrtStream); + using aclrtDestroyContextFunc = aclError (*)(aclrtContext); + using aclrtResetDeviceFunc = aclError (*)(int32_t); + using aclrtGetDeviceFunc = rtError_t (*)(int32_t *); + + using rtFunctionRegisterFunc = rtError_t (*)(void *, const void *, const char *, const void *, uint32_t); using rtDevBinaryRegisterFunc = rtError_t (*)(const rtDevBinary_t *, void **); using rtDevBinaryUnRegisterFunc = rtError_t (*)(void *); - using rtFunctionRegisterFunc = rtError_t (*)(void *, const void *, const char *, const void *, uint32_t); - using rtKernelLaunchFunc = rtError_t (*)(const void *, uint32_t, void *, uint32_t, rtL2Ctrl_t *, rtStream_t); + using rtGetTaskIdAndStreamIDFunc = rtError_t (*)(uint32_t *, uint32_t *); + + // aclrt + DEFINE_FUNC_PTR(aclrtSetCurrentContext); + DEFINE_FUNC_PTR(aclrtGetDeviceCount); + DEFINE_FUNC_PTR(aclrtGetCurrentContext); + DEFINE_FUNC_PTR(aclrtCreateStreamWithConfig); + DEFINE_FUNC_PTR(aclrtMemcpyAsync); + DEFINE_FUNC_PTR(aclrtGetMemInfo); + DEFINE_FUNC_PTR(aclrtSetDevice); + DEFINE_FUNC_PTR(aclrtCreateContext); + DEFINE_FUNC_PTR(aclrtCreateStream); + DEFINE_FUNC_PTR(aclrtMallocHost); + DEFINE_FUNC_PTR(aclrtMalloc); + DEFINE_FUNC_PTR(aclrtMemcpy); + DEFINE_FUNC_PTR(aclrtSynchronizeStream); + DEFINE_FUNC_PTR(aclrtFree); + DEFINE_FUNC_PTR(aclrtFreeHost); + DEFINE_FUNC_PTR(aclrtDestroyStream); + DEFINE_FUNC_PTR(aclrtDestroyContext); + DEFINE_FUNC_PTR(aclrtResetDevice); + DEFINE_FUNC_PTR(aclrtGetDevice); - DEFINE_FUNC_PTR(rtGetDevice); - DEFINE_FUNC_PTR(rtGetDeviceCount); - DEFINE_FUNC_PTR(rtSetDevice); - DEFINE_FUNC_PTR(rtDeviceReset); - DEFINE_FUNC_PTR(rtCtxCreate); - DEFINE_FUNC_PTR(rtCtxDestroy); - DEFINE_FUNC_PTR(rtCtxGetCurrent); - DEFINE_FUNC_PTR(rtCtxSetCurrent); - DEFINE_FUNC_PTR(rtCtxSynchronize); - DEFINE_FUNC_PTR(rtMemGetInfoEx); - DEFINE_FUNC_PTR(rtEventCreate); - DEFINE_FUNC_PTR(rtStreamCreate); - DEFINE_FUNC_PTR(rtStreamCreateWithFlags); - DEFINE_FUNC_PTR(rtStreamDestroy); - DEFINE_FUNC_PTR(rtStreamSynchronize); - DEFINE_FUNC_PTR(rtStreamWaitEvent); - DEFINE_FUNC_PTR(rtMalloc); - DEFINE_FUNC_PTR(rtFree); - DEFINE_FUNC_PTR(rtMemcpy); - DEFINE_FUNC_PTR(rtMemcpyAsync); - DEFINE_FUNC_PTR(rtDevBinaryRegister); - DEFINE_FUNC_PTR(rtDevBinaryUnRegister); - DEFINE_FUNC_PTR(rtFunctionRegister); - DEFINE_FUNC_PTR(rtKernelLaunch); + // rt + DEFINE_FUNC_PTR(rtGetTaskIdAndStreamID); }; } // namespace runtime -- Gitee