From 964188ef45f356121e66655f68442399a7b000a2 Mon Sep 17 00:00:00 2001 From: yiyanzhi_akane Date: Mon, 22 Mar 2021 14:42:57 +0800 Subject: [PATCH] [TUNING] add the gpu-tuning process to master bug fix for ascend gen tuning space bug fix: args in kernel_exec.py --- python/akg/build_module.py | 16 +- python/akg/utils/custom_tiling.py | 36 +- python/akg/utils/kernel_exec.py | 11 +- src/poly/tiling/custom_tiling.h | 24 + src/poly/tiling/gen_tiling_space.cc | 77 +- src/poly/tiling/tile_space.h | 10 + src/poly/tiling/tiling_analyzer.cc | 24 +- src/poly/tiling/tiling_analyzer.h | 31 +- src/poly/tiling/tiling_strategy_manager.h | 9 +- .../tiling/tiling_strategy_manager_gpu.cc | 242 +++++- tests/fuzz/tune_for_gpu/__init__.py | 0 .../autotuning/data_utils/sort_log.py | 17 + .../tune_for_gpu/autotuning/gen_spaces_gpu.py | 95 +++ tests/fuzz/tune_for_gpu/autotuning/job.py | 501 ++++++++++++ .../autotuning/kernel_compiler.py | 407 ++++++++++ tests/fuzz/tune_for_gpu/autotuning/runner.py | 243 ++++++ tests/fuzz/tune_for_gpu/autotuning/space.py | 217 +++++ .../autotuning/space_generators.py | 753 ++++++++++++++++++ .../autotuning/test_data_generators.py | 147 ++++ .../autotuning/tiling_strategies_gpu.py | 84 ++ tests/fuzz/tune_for_gpu/autotuning/tuner.py | 359 +++++++++ .../reduce_tuning_attrs_desc.json | 9 + .../tune_for_gpu/autotuning/tuning_utils.py | 155 ++++ .../autotuning/type_definitions.py | 49 ++ tests/fuzz/tune_for_gpu/config_gpu.sh | 16 + tests/fuzz/tune_for_gpu/test_gpu.py | 67 ++ tests/test_env.sh | 2 +- 27 files changed, 3520 insertions(+), 81 deletions(-) create mode 100644 tests/fuzz/tune_for_gpu/__init__.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/job.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/runner.py create mode 
100644 tests/fuzz/tune_for_gpu/autotuning/space.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/space_generators.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuner.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/type_definitions.py create mode 100644 tests/fuzz/tune_for_gpu/config_gpu.sh create mode 100644 tests/fuzz/tune_for_gpu/test_gpu.py diff --git a/python/akg/build_module.py b/python/akg/build_module.py index 3f70f311..54a16033 100644 --- a/python/akg/build_module.py +++ b/python/akg/build_module.py @@ -50,7 +50,17 @@ def dump_tiling_info(level): logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1], tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1], tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0], - tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0]) + tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0], + ) + idx_to_str = {0: "x", 1: "y", 2: "z"} + for i in range(len(tuning_spaces["thread_range"])): + info = "[thread.%s] range [%d, %d](jump by %d), " + logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1], + tuning_spaces['thread_mod'][i][0], ) + for i in range(len(tuning_spaces["block_range"])): + info = "[block.%s] range [%d, %d](jump by %d)" + logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0], + tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],) logging.info("===============================================") elif isinstance(indice, int) and indice == EMPTY_CODE: logging.info("Empty tiling space.") @@ -108,6 +118,10 @@ def lower(sch, 
args, shape_params=None, name="default_function", binds=None, att tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist() tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist() tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist() + tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist() + tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist() + tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist() + tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist() if level >= help_tiling_level["Candidates"]: tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist() if not tuning: diff --git a/python/akg/utils/custom_tiling.py b/python/akg/utils/custom_tiling.py index e5d7d060..4430af5b 100644 --- a/python/akg/utils/custom_tiling.py +++ b/python/akg/utils/custom_tiling.py @@ -70,15 +70,33 @@ class TileConstraint(Enum): SET_EXPANSION = "SET_EXPANSION" SET_MEM_RATIO = "SET_MEM_RATIO" SET_AXIS_INFO = "SET_AXIS_INFO" + THREAD_MIN = "THREAD_MIN" + THREAD_MAX = "THREAD_MAX" + THREAD_MOD = "THREAD_MOD" + BLOCK_MIN = "BLOCK_MIN" + BLOCK_MAX = "BLOCK_MAX" + BLOCK_MOD = "BLOCK_MOD" -@check_input_type((double, float, int), TileConstraint, TileLevel) +@check_input_type((double, float, int, list), TileConstraint, TileLevel) def modify_common_constraints(value, constraint, level=TileLevel.C1): """api for dsl to modify some default constraint used in auto tiling.""" if constraint not in TileConstraint: raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint)) if constraint == TileConstraint.SET_MEM_RATIO: return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value)) + if constraint == TileConstraint.THREAD_MIN: + return create_custom_tiling_node(TileMode.COMMON, thread_min=value) + if constraint == TileConstraint.THREAD_MAX: + return create_custom_tiling_node(TileMode.COMMON, thread_max=value) + if 
constraint == TileConstraint.THREAD_MOD: + return create_custom_tiling_node(TileMode.COMMON, thread_mod=value) + if constraint == TileConstraint.BLOCK_MIN: + return create_custom_tiling_node(TileMode.COMMON, block_min=value) + if constraint == TileConstraint.BLOCK_MAX: + return create_custom_tiling_node(TileMode.COMMON, block_max=value) + if constraint == TileConstraint.BLOCK_MOD: + return create_custom_tiling_node(TileMode.COMMON, block_mod=value) raise TypeError("Constraint {} is not supported in this api, please use other api" .format(constraint.value)) @@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode, axis_info=DEFAULT_STRING, priority=DEFAULT_VALUE, expansion=DEFAULT_VALUE, - mem_ratio=double(DEFAULT_VALUE)): + mem_ratio=double(DEFAULT_VALUE), + thread_min=[], + thread_max=[], + thread_mod=[], + block_min=[], + block_max=[], + block_mod=[]): """default method to create custom tiling node, all values are default except tile mode.""" tile_min = to_tvm_type(tile_min, "tile_min") @@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode, axis_info=akg.tvm.expr.StringImm(axis_info), priority=priority, expansion=expansion, - mem_ratio=mem_ratio) + mem_ratio=mem_ratio, + thread_min=thread_min, + thread_max=thread_max, + thread_mod=thread_mod, + block_min=block_min, + block_max=block_max, + block_mod=block_mod) def template_nc1hwc0(tensor_name, level): diff --git a/python/akg/utils/kernel_exec.py b/python/akg/utils/kernel_exec.py index cd98ec6b..1bee4e3c 100644 --- a/python/akg/utils/kernel_exec.py +++ b/python/akg/utils/kernel_exec.py @@ -35,6 +35,7 @@ import numpy as np import akg from akg.build_module import help_tiling_level +from akg import backend as cce import akg.tvm from akg.tvm import autotvm from akg.tvm import rpc @@ -88,7 +89,6 @@ def debug_mode(debug_flag): pass_list.append((0, ir_pass.inject_dma_intrin)) return pass_list - def func_time_required(func_name): """Checking the Time Required for Function Running.""" def wrapper(*args, 
**kwargs): @@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs): return None @func_time_required -def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None): +def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400): """ unified run CCE kernel api. @@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None) if not tuning: return out_list[0] if len(out_list) == 1 else tuple(out_list) else: - cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True) + cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time) return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles} stat_info = {} @@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None if tuning or (level is not None and level > help_tiling_level['None']): return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target) - mode = get_runtime_mode() if mode == "cpu": mod = akg.tvm.build(s, op_var, "llvm") @@ -1069,12 +1068,12 @@ def get_device_id(): logging.error(e) return 0 -def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False): +def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400): "get gpu profiling cycles." 
func = tvm.get_global_func('GPUProfilerInit') func("") from akg.utils.result_analysis import gpu_profiling - gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id) + gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id) func = tvm.get_global_func('GPUProfilerStop') a = func() return int(a) diff --git a/src/poly/tiling/custom_tiling.h b/src/poly/tiling/custom_tiling.h index b1da52cb..f3848af0 100644 --- a/src/poly/tiling/custom_tiling.h +++ b/src/poly/tiling/custom_tiling.h @@ -80,6 +80,24 @@ class CustomTilingNode : public Node { * default is 0.5 which is reserved for double buffer*/ double mem_ratio; + /*! \brief minimal thread binding factor on gpu, greater than 0*/ + Array thread_min; + + /*! \brief maximal thread binding factor on gpu*/ + Array thread_max; + + /*! \brief constraint thread binding factor % thread_mod == 0*/ + Array thread_mod; + + /*! \brief minimal block binding factor on gpu, greater than 0*/ + Array block_min; + + /*! \brief maximal block binding factor on gpu*/ + Array block_max; + + /*! 
\brief constraint block binding factor % block_mod == 0*/ + Array block_mod; + void VisitAttrs(AttrVisitor *v) { v->Visit("tile_level", &tile_level); v->Visit("tile_mode", &tile_mode); @@ -97,6 +115,12 @@ class CustomTilingNode : public Node { v->Visit("priority", &priority); v->Visit("expansion", &expansion); v->Visit("mem_ratio", &mem_ratio); + v->Visit("thread_min", &thread_min); + v->Visit("thread_max", &thread_max); + v->Visit("thread_mod", &thread_mod); + v->Visit("block_min", &block_min); + v->Visit("block_max", &block_max); + v->Visit("block_mod", &block_mod); } static constexpr const char *_type_key = "CustomTilingNode"; diff --git a/src/poly/tiling/gen_tiling_space.cc b/src/poly/tiling/gen_tiling_space.cc index 779ef84a..72f61623 100644 --- a/src/poly/tiling/gen_tiling_space.cc +++ b/src/poly/tiling/gen_tiling_space.cc @@ -36,6 +36,15 @@ class TileSpaceCollector { space_->c1_tile_mod_table = init_array; space_->c0_tile_mod_table = init_array; space_->tiling_candidate = init_array; + space_->gpu_thread_range_table = init_array; + space_->gpu_block_range_table = init_array; + space_->gpu_thread_mod_table = init_array; + space_->gpu_block_mod_table = init_array; + if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) { + cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"}; + } else { + cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; + } } ~TileSpaceCollector() = default; @@ -122,38 +131,61 @@ class TileSpaceCollector { // step 2. collect cared info from each axis for (const auto &con : cared_info_) { int length = con.find("mod") != std::string::npos ? 
1 : 2; - auto array = air::runtime::NDArray::Empty({static_cast(tile_size), length}, type, ctx); + auto size = static_cast(tile_size); + if (con.find("gpu") != std::string::npos) { + size = std::max(3, size); + } + auto array = air::runtime::NDArray::Empty({size, length}, type, ctx); auto spaceDlPack = array.ToDLPack(); auto ptr = reinterpret_cast(spaceDlPack->dl_tensor.data); - for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { - for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { - if (con == "index") { - *ptr++ = b_idx; - *ptr++ = a_idx; + if (con.find("gpu") != std::string::npos) { + size_t s = con.find("thread") != std::string::npos ? 0 : 3; + size_t e = con.find("thread") != std::string::npos ? 3 : 6; + for (size_t i = s; i < e; ++i) { + if (length == 1) { + *ptr++ = analyzer_.binding_spaces_[i].map_mod_; } else { - if (con == "C1_range") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); - *ptr++ = const_cons.tile_min_.as()->value; - *ptr++ = const_cons.tile_extent_.as()->value; - } else if (con == "C0_range") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); - *ptr++ = const_cons.tile_min_.as()->value; - *ptr++ = const_cons.tile_extent_.as()->value; - } else if (con == "C1_mod") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); - *ptr++ = const_cons.tile_mod_.as()->value; - } else if (con == "C0_mod") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); - *ptr++ = const_cons.tile_mod_.as()->value; + *ptr++ = analyzer_.binding_spaces_[i].map_min_; + *ptr++ = analyzer_.binding_spaces_[i].map_extent_; + } + } + } else { + for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { + for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { + if (con == "index") { + *ptr++ = b_idx; + *ptr++ = a_idx; + } else { + if (con == "C1_range") { + TileAxis::Constraint const_cons = 
all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); + *ptr++ = const_cons.tile_min_.as()->value; + *ptr++ = const_cons.tile_extent_.as()->value; + } else if (con == "C0_range") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); + *ptr++ = const_cons.tile_min_.as()->value; + *ptr++ = const_cons.tile_extent_.as()->value; + } else if (con == "C1_mod") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); + *ptr++ = const_cons.tile_mod_.as()->value; + } else if (con == "C0_mod") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); + *ptr++ = const_cons.tile_mod_.as()->value; + } } } } } + if (con == "index") space_->index_table = array; if (con == "C1_range") space_->c1_tile_range_table = array; if (con == "C0_range") space_->c0_tile_range_table = array; if (con == "C1_mod") space_->c1_tile_mod_table = array; if (con == "C0_mod") space_->c0_tile_mod_table = array; + if (con == "gpu_thread_range") space_->gpu_thread_range_table = array; + if (con == "gpu_block_range") space_->gpu_block_range_table = array; + if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array; + if (con == "gpu_block_mod") space_->gpu_block_mod_table = array; + delete spaceDlPack; } } @@ -196,7 +228,8 @@ class TileSpaceCollector { bool min_tile_ok = false; for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) { bool break_constraint = - (tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0); + ((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) || + (axis->forbid_iso && tile_extent->value % tile != 0); if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) { continue; } @@ -365,7 +398,7 @@ class TileSpaceCollector { DLContext ctx = {kDLCPU, 0}; std::vector tile_axes_; std::vector is_shared_; - std::unordered_set cared_info_ = {"index", "C1_range", "C0_range", 
"C1_mod", "C0_mod"}; + std::unordered_set cared_info_; struct Result { std::vector tile; diff --git a/src/poly/tiling/tile_space.h b/src/poly/tiling/tile_space.h index e1a00a02..5171e85a 100644 --- a/src/poly/tiling/tile_space.h +++ b/src/poly/tiling/tile_space.h @@ -28,6 +28,11 @@ class TileSpaceNode : public Node { air::runtime::NDArray c1_tile_mod_table; air::runtime::NDArray c0_tile_mod_table; air::runtime::NDArray tiling_candidate; + air::runtime::NDArray gpu_thread_range_table; + air::runtime::NDArray gpu_block_range_table; + air::runtime::NDArray gpu_thread_mod_table; + air::runtime::NDArray gpu_block_mod_table; + void VisitAttrs(AttrVisitor *v) { v->Visit("index_table", &index_table); @@ -36,6 +41,11 @@ class TileSpaceNode : public Node { v->Visit("c1_tile_mod_table", &c1_tile_mod_table); v->Visit("c0_tile_mod_table", &c0_tile_mod_table); v->Visit("tiling_candidate", &tiling_candidate); + v->Visit("gpu_thread_range_table", &gpu_thread_range_table); + v->Visit("gpu_block_range_table", &gpu_block_range_table); + v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table); + v->Visit("gpu_block_mod_table", &gpu_block_mod_table); + } static constexpr const char *_type_key = "TileSpace"; TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node); diff --git a/src/poly/tiling/tiling_analyzer.cc b/src/poly/tiling/tiling_analyzer.cc index e4b37dc5..bdb97de5 100644 --- a/src/poly/tiling/tiling_analyzer.cc +++ b/src/poly/tiling/tiling_analyzer.cc @@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() { if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { ReduceStrategy reduce_strategy(this); - actived_strategies.push_back(&reduce_strategy); ModStrategy mod_strategy(this); - actived_strategies.push_back(&mod_strategy); - + GemmStrategy gemm_strategy(this); GpuDmaAnalysisStrategy dma_analysis_strategy(this); + CustomTilingStrategy custom_strategy(this); GpuStrategy gpu_strategy(this); if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) { 
actived_strategies.push_back(&dma_analysis_strategy); } else { + if (scop_info_.user_config_.GetIsTuning()) { + actived_strategies.push_back(&custom_strategy); + } else { + actived_strategies.push_back(&reduce_strategy); + actived_strategies.push_back(&mod_strategy); + actived_strategies.push_back(&gemm_strategy); + } actived_strategies.push_back(&gpu_strategy); } strategy_manager->SetStrategies(actived_strategies); strategy_manager->ExecuteGpu(); + if (scop_info_.user_config_.GetIsTuning()) { + binding_spaces_.clear(); + for (auto i : gpu_strategy.thread_binding_spaces_) { + UpdateBindingSpace(i); + } + for (auto i : gpu_strategy.block_binding_spaces_) { + UpdateBindingSpace(i); + } + } return; } } @@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() { if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { CastStrategy cast_strategy(this); actived_strategies.push_back(&cast_strategy); - strategy_manager->SetStrategies(actived_strategies); strategy_manager->ExecuteGpu(); return; @@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() { bool TilingAnalyzer::Prepare() { logger_ = std::unique_ptr(new (std::nothrow) TileLogger( - scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); + scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); CHECK(logger_) << "memory alloc fail."; // Stage 1: Analyze schedule tree. 
ScheduleTreeAnalyzer sch_ana(this, this->sch_); diff --git a/src/poly/tiling/tiling_analyzer.h b/src/poly/tiling/tiling_analyzer.h index 3a00e83d..b3adc711 100644 --- a/src/poly/tiling/tiling_analyzer.h +++ b/src/poly/tiling/tiling_analyzer.h @@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) { return (ALIGN_BYTES + dtype - 1) / dtype; } -inline int64_t GetMaxAlignBytes(std::unordered_map> dtypes) { +inline int64_t GetMinBytes(std::unordered_map> dtypes) { int64_t min_byte = -1; for (const auto &it : dtypes) { if (it.second.empty()) { @@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map min_byte = min_elem; } } - return GetAlignBytes(min_byte); + return min_byte; +} + +inline int64_t GetMaxAlignBytes(std::unordered_map> dtypes) { + return GetAlignBytes(GetMinBytes(dtypes)); } inline Expr CastToExpr(const std::string &value) { @@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND"; constexpr auto AT_MOD = "MOD"; constexpr auto AT_CAST = "CAST"; constexpr auto AT_MEM_RATIO = "MEM_RATIO"; +constexpr auto AT_THREAD_MIN = "THREAD_MIN"; +constexpr auto AT_THREAD_MAX = "THREAD_MAX"; +constexpr auto AT_THREAD_MOD = "THREAD_MOD"; +constexpr auto AT_BLOCK_MIN = "BLOCK_MIN"; +constexpr auto AT_BLOCK_MAX = "BLOCK_MAX"; +constexpr auto AT_BLOCK_MOD = "BLOCK_MOD"; class TilingAnalyzer; @@ -233,12 +243,12 @@ class TilingAnalyzer { sch_(sch), scop_info_(scop_info), is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) { - if (scop_info.mmu_info_.IsGemm()) { - op_type_ = GEMM_OP; - } else if (scop_info.mmu_info_.IsConv()) { - op_type_ = CONV_OP; - } else { - op_type_ = VECTOR_OP; + if (scop_info.mmu_info_.IsGemm()) { + op_type_ = GEMM_OP; + } else if (scop_info.mmu_info_.IsConv()) { + op_type_ = CONV_OP; + } else { + op_type_ = VECTOR_OP; } } @@ -292,7 +302,7 @@ class TilingAnalyzer { CHECK(logger_); return *(logger_.get()); } - + void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { 
binding_spaces_.emplace_back(constraint); } Stmt body_; Binds &binds_; isl::schedule sch_; @@ -306,9 +316,8 @@ class TilingAnalyzer { std::unordered_map> buffer_usage_timetable_; std::unordered_map> buf_info_; - bool is_retry_{false}; - + std::vector binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z] private: void AddTilingConstraints(); void AddPostTilingConstraints(); diff --git a/src/poly/tiling/tiling_strategy_manager.h b/src/poly/tiling/tiling_strategy_manager.h index 140b0cae..513745fc 100644 --- a/src/poly/tiling/tiling_strategy_manager.h +++ b/src/poly/tiling/tiling_strategy_manager.h @@ -284,8 +284,6 @@ class GemmStrategy : public TilingStrategy { ~GemmStrategy() {} void AddNpuConstraint(); void AddGpuConstraint(); - - std::string interested_attr_key = AT_GEMM; }; class GpuStrategy : public TilingStrategy { @@ -306,6 +304,8 @@ class GpuStrategy : public TilingStrategy { }; void AddNpuConstraint(); void AddGpuConstraint(); + std::vector thread_binding_spaces_; // [thread.x, thread.y, thread.z] + std::vector block_binding_spaces_; // [block.x, block.y, block.z] private: void DetermineTemplate(); @@ -326,6 +326,8 @@ class GpuStrategy : public TilingStrategy { // Step 1. Collect axes and sort them from inner to outer void BuildAxesQueue(); + void ApplyCustomConstraint(); + /* * Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks. * e.g. 
@@ -357,6 +359,7 @@ class GpuStrategy : public TilingStrategy { int64_t min_elem_for_io_bound_ = 2; size_t depth_{0}; bool need_reverse_{false}; + bool reverse_binding_{false}; int64_t fused_size_{1}; std::unordered_map template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"}, {3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"}, @@ -378,7 +381,7 @@ class MulticoreStrategy { class TilingPriorityScorer { public: - TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} + TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} ~TilingPriorityScorer() {} /* diff --git a/src/poly/tiling/tiling_strategy_manager_gpu.cc b/src/poly/tiling/tiling_strategy_manager_gpu.cc index f0f9083c..8233b21c 100644 --- a/src/poly/tiling/tiling_strategy_manager_gpu.cc +++ b/src/poly/tiling/tiling_strategy_manager_gpu.cc @@ -18,7 +18,6 @@ #include #include "tiling_analyzer.h" - namespace akg { namespace ir { namespace poly { @@ -377,13 +376,129 @@ void ReduceStrategy::DealWithPostReduceTensors() { } } +void GpuStrategy::ApplyCustomConstraint() { + auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) { + std::vector sp = akg::common::Split(constraint, ","); + std::vector ret; + for (auto val : sp) { + if (ret.size() == max_size) { + break; + } + CHECK(!val.empty()); + ret.emplace_back(static_cast(std::strtol(val.c_str(), nullptr, 10))); + } + return ret; + }; + + // init binding space through template-determined limit + thread_binding_spaces_.clear(); + block_binding_spaces_.clear(); + for (size_t i = 0; i < thread_limit_.size(); ++i) { + TileAxis::MappingConstraint elem; + elem.map_extent_ = thread_limit_[i]; + thread_binding_spaces_.emplace_back(elem); + } + for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) { + TileAxis::MappingConstraint elem; + elem.map_extent_ = block_limit_[i]; + 
block_binding_spaces_.emplace_back(elem); + } + + // add constraints to binding space according to custom tiling + std::unordered_set thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD}; + std::unordered_set block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD}; + for (const auto attr : analyzer_->RootAxis()->attrs) { + std::vector constraint; + std::vector target; + if (thread_keys.find(attr.attr_key) != thread_keys.end()) { + constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size()); + target = thread_binding_spaces_; + } else if (block_keys.find(attr.attr_key) != block_keys.end()) { + constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size()); + target = block_binding_spaces_; + } + if (constraint.empty()) { + continue; + } + + for (size_t i = 0; i < constraint.size(); ++i) { + if (attr.attr_key.find("MIN") != std::string::npos) { + target[i].map_min_ = std::max(target[i].map_min_, constraint[i]); + } else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) { + target[i].map_extent_ = std::min(target[i].map_extent_, constraint[i]); + } else if (attr.attr_key.find("MOD") != std::string::npos) { + target[i].map_mod_ = std::max(1, constraint[i]); + } + } + + if (thread_keys.find(attr.attr_key) != thread_keys.end()) { + thread_binding_spaces_ = target; + } else if (block_keys.find(attr.attr_key) != block_keys.end()) { + block_binding_spaces_ = target; + } + } + + // apply custom constraint to corresponding axis and modify binding space according to tile range of axis + size_t cur_depth = 0; + analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) { + if (axis == analyzer_->RootAxis()) { + return; + } + auto cons = axis->GetConstConstraint(CACHE1); + auto range_extent = axis->GetConstExtent(); + int tile_min = cons.tile_min_.as()->value; + int tile_extent = cons.tile_extent_.as()->value; + auto idx = reverse_binding_ ? 
cur_depth : depth_ - 1 - cur_depth; + + auto thread_extent = tile_extent; + if (idx < thread_binding_spaces_.size()) { + thread_extent = std::min(thread_extent, thread_binding_spaces_[idx].map_extent_); + thread_binding_spaces_[idx].map_extent_ = thread_extent; + } + + auto block_extent = range_extent / tile_min; + if (idx < block_binding_spaces_.size()) { + block_extent = std::min(block_extent, block_binding_spaces_[idx].map_extent_); + block_binding_spaces_[idx].map_extent_ = block_extent; + } + + auto block_min = block_extent / std::max(1, thread_extent); + if (idx < block_binding_spaces_.size()) { + block_min = std::max(block_min, block_binding_spaces_[idx].map_min_); + block_binding_spaces_[idx].map_min_ = block_min; + } + + axis->thread_constraints.map_extent_ = thread_extent; + axis->block_constraints.map_extent_ = block_extent; + axis->block_constraints.map_min_ = block_min; + if (idx < thread_binding_spaces_.size()) { + axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_; + } + if (idx < block_binding_spaces_.size()) { + axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_; + } + ++cur_depth; + }); +} + void GpuStrategy::AddGpuConstraint() { InitMappingLimit(); - if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) { + if (!analyzer_->scop_info_.user_config_.GetIsTuning() && + (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) { BroadcastSpeedup(); } BuildAxesQueue(); if (analyzer_->scop_info_.user_config_.GetIsTuning()) { + ApplyCustomConstraint(); + for (size_t i = 0; i < max_dim_; ++i) { + TileAxis::MappingConstraint pad; + if (i >= thread_binding_spaces_.size()) { + thread_binding_spaces_.emplace_back(pad); + } + if (i >= block_binding_spaces_.size()) { + block_binding_spaces_.emplace_back(pad); + } + } return; } InnerThreadOuterBlock(); @@ -391,19 +506,27 @@ void GpuStrategy::AddGpuConstraint() { InjectiveSpeedup(); } SetMappingConfig(); + if (template_ != 
Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { + analyzer_->ForEachAxisTopDown([this](TileAxis *axis) { + if (axis == analyzer_->RootAxis()) { + return; + } + axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0); + }); + } } void GpuStrategy::InitMappingLimit() { max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread(); DetermineTemplate(); std::stringstream ss; - need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && - analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; + reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && + analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; if (template_ == Template::CUSTOM_CONFIG) { auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig(); for (size_t i = 0; i < thread_config->bound; ++i) { - auto idx = need_reverse_ ? thread_config->bound - 1 - i : i; + auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i; if (idx >= depth_) { continue; } @@ -427,12 +550,16 @@ void GpuStrategy::InitMappingLimit() { } else if (template_ == Template::MATMUL) { // This is a naive tiling strategy used in gpu when thread and block configs are already set. // This strategy will tile up to three inner-most axes to 32 (for thread binding). - thread_limit_ = {32, 8}; + if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { + thread_limit_ = {warp_sizes_, 16}; + } else { + thread_limit_ = {warp_sizes_, 8}; + } } else { thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_}; } - if (template_ != Template::CUSTOM_CONFIG) { + if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { AdjustThreadMappingLimit(); } @@ -505,13 +632,21 @@ void GpuStrategy::InnerThreadOuterBlock() { tile = tile == SpItemPerThread::AUTO ? 
std::min(axis->thread_constraints.item_process_, max_elem_per_thread_) : tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_) : 1; - if (axis->block_constraints.map_extent_ > 1) { - tile = - std::max(tile, std::max(ceil(static_cast(shape) / axis->block_constraints.map_extent_), 1)); - pending_axes_.push_back(std::make_pair(axis, std::max(ceil(static_cast(shape) / tile), 1))); - ss << ", map to block."; + auto tile_min = axis->c1_constraints.tile_min_.as()->value; + auto tile_extent = axis->c1_constraints.tile_extent_.as()->value; + if (tile_min == tile_extent && tile_extent != MIN_TILE) { + ss << "tile extent is already determined = " << tile_extent; + analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); + tile = tile_min; } else { - tile = std::min(tile, shape); + if (axis->block_constraints.map_extent_ > 1) { + tile = + std::max(tile, std::max(ceil(static_cast(shape) / axis->block_constraints.map_extent_), 1)); + pending_axes_.push_back(std::make_pair(axis, std::max(ceil(static_cast(shape) / tile), 1))); + ss << ", map to block."; + } else { + tile = std::min(tile, shape); + } } axis->TileRestrainLower(tile, TileLevel::CACHE1); ss << ", tile = " << tile; @@ -522,19 +657,11 @@ void GpuStrategy::InnerThreadOuterBlock() { rest_threads = std::min(rest_threads, axis->thread_constraints.map_extent_); } - if (thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { + if (rest_threads <= 1 || thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { ss << ", no thread/dim rests"; SkipMapping(); continue; } - if (rest_threads <= 1) { - if (axis->mc_sup || - (template_ == Template::REDUCTION && analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib())) { - thread_cfg_.emplace_back(1); - } - SkipMapping(); - continue; - } auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? 
axis->thread_constraints.item_process_ : elem_per_thread_[inner_dim]; item = std::min(item, max_elem_per_thread_); @@ -575,6 +702,7 @@ void GpuStrategy::InnerThreadOuterBlock() { if (pending_axes_.size() - i > block_dim) { auto axis = pending_axes_[i].first; ss << "axis " << axis->index << "_" << axis->dim_axis + << " exceeded block dim and should be mapped to block for higher performance, consider flatten"; analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); continue; @@ -594,7 +722,7 @@ void GpuStrategy::InnerThreadOuterBlock() { int64_t shape; std::tie(axis, shape) = pending_axes_[i]; auto idx = pending_axes_.size() - 1 - i; - idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx; + idx = reverse_binding_ ? block_limit_.size() - 1 - idx : idx; auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]); rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_); ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks; @@ -635,11 +763,9 @@ void GpuStrategy::SetMappingConfig() { if (block_cfg_.empty()) { block_cfg_.emplace_back(1); } - bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && - analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION); std::string block_str = ""; std::string thread_str = ""; - if (reverse_binding) { + if (reverse_binding_) { for (int i = 0; i < static_cast(block_cfg_.size()); ++i) { if (i >= block_count_) { continue; @@ -753,7 +879,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in tile = thread_size; ss << "tile = thread size, "; } else { - auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim; + auto block_dim = reverse_binding_ ? 
inner_dim : block_limit_.size() - 1 - inner_dim; int64_t least_blocks; if (block_dim >= 0 && block_dim < block_limit_.size()) { least_blocks = block_limit_[block_dim]; @@ -1139,12 +1265,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { } } +void CustomTilingStrategy::AddGpuConstraint() { + auto interested_info = GetInterestedInfo(interested_attr_key, false); + for (auto it : interested_info) { + TileAxis *axis = it.first; + for (auto attr : it.second) { + std::vector modes = akg::common::Split(attr.attr_key, ":"); + CHECK_EQ(modes.size(), 2U); + std::string constraint_str = attr.attr_value; + if (constraint_str.find("->") != std::string::npos) { + std::vector res = akg::common::Split(constraint_str, "->"); + constraint_str = res[1]; + } + std::vector constraints = akg::common::Split(constraint_str, "_"); + CHECK_GE(constraints.size(), 1U); + std::vector level = akg::common::Split(constraints[0], ":"); + CHECK(level.size() == 2U && level[0] == "LEVEL"); + CHECK(level[1] == "C1" || level[1] == "C0"); + TileLevel lv = level[1] == "C1" ? 
CACHE1 : CACHE0; + constraints.erase(constraints.begin()); + for (const auto &con : constraints) { + std::vector items = akg::common::Split(con, ":"); + CHECK_EQ(items.size(), 2U); + CHECK_NE(items[0], ""); + CHECK_NE(items[1], ""); + if (items[0] == "MIN") { + if (items[1] == "MIN") { + if (lv == CACHE1) { + axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_; + } else if (lv == CACHE0) { + axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_; + } + } else { + if (lv == CACHE1) { + axis->c1_constraints.tile_min_ = CastToExpr(items[1]); + } else if (lv == CACHE0) { + axis->c0_constraints.tile_min_ = CastToExpr(items[1]); + } + } + } else if (items[0] == "FACTOR") { + axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv); + } else if (items[0] == "FORBIDISO") { + axis->forbid_iso = true; + } else if (items[0] == "MAX") { + if (items[1] == "FULL") { + axis->TileRestrainEntire(lv); + } else { + if (lv == CACHE1) { + axis->c1_constraints.tile_extent_ = CastToExpr(items[1]); + } else if (lv == CACHE0) { + axis->c0_constraints.tile_extent_ = CastToExpr(items[1]); + } + } + } else if (items[0] == AT_MOD) { + axis->TileRestrainMod(CastToExpr(items[1]), lv); + } + } + } + } +} + // No constraint found in cuda void ModStrategy::AddGpuConstraint() {} -void CustomTilingStrategy::AddGpuConstraint() {} - void ConflictTreeRangeStrategy::AddGpuConstraint() {} void VectorizedStrategy::AddGpuConstraint() {} diff --git a/tests/fuzz/tune_for_gpu/__init__.py b/tests/fuzz/tune_for_gpu/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py b/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py new file mode 100644 index 00000000..377cead2 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py @@ -0,0 +1,17 @@ +import sys + +if __name__ == "__main__": + from_log_file = str(sys.argv[1]) + sorted_log_file = str(sys.argv[2]) + f_in = open(from_log_file, 'r') 
+ f_out = open(sorted_log_file, "wt") + d = dict() + for line in f_in: + config = line.split("|") + d[str(config[1])] = float(config[2]) + sorted_dict = {k: v for k, v in sorted( + d.items(), key=lambda item: (item[1], item[0]))} + for k, v in sorted_dict.items(): + f_out.write("|" + str(k) + "|" + str(v) + "\n") + f_in.close() + f_out.close() diff --git a/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py b/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py new file mode 100644 index 00000000..3516bbf5 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py @@ -0,0 +1,95 @@ +from .kernel_compiler import compile_kernel +from collections import namedtuple +from .space import ListConfigSpace + +def get_reduce_axis_length(in_shape,reduce_axis): + lx, ly = 1, 1 + if reduce_axis == None or len(reduce_axis) == len(in_shape): + for v in in_shape: lx *= v + elif (len(in_shape) - 1) in reduce_axis: + for i in range(len(in_shape)): + if i in reduce_axis: + lx *= in_shape[i] + else: + ly *= in_shape[i] + + else: + for i in range(len(in_shape)): + if i in reduce_axis: + ly *= in_shape[i] + else: + lx *= in_shape[i] + + return lx, ly + + +def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of reduce_sum operators in gpu""" + space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0, + gen_tiling_spaces=True) + + in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis + dim_len = 1 if reduce_axis == None or len(reduce_axis) == len(in_shape) else 2 + dim_names = ['tiling_' + str(i) for i in range(dim_len)] + dim_names.append("block_x") + dim_names.append("block_y") + dim_names.append("block_z") + dim_names.append("thread_x") + dim_names.append("thread_y") + dim_names.append("thread_z") + for key in tuning_attrs_info[0]: + dim_names.append(key) + lx, ly = get_reduce_axis_length(in_shape, reduce_axis) + + tiling_spaces = [] + if reduce_axis == None or 
len(reduce_axis) == len(in_shape): + """all-reduce""" + possible_tx_list = [2**i for i in range(4,11)] + for tx in possible_tx_list: + if tx > lx: break + possible_dim0_list = [d0 for d0 in range(tx, lx+1, tx)] + if possible_dim0_list[-1] != lx: possible_dim0_list.append(lx) + for d0 in possible_dim0_list: + bx = lx//d0 if lx % d0 == 0 else lx//d0+1 + tiling_spaces.append([d0,bx,1,1,tx,1,1]) + + + elif (len(in_shape) - 1) in reduce_axis: + """reduce-x""" + possible_tx_list = [2**i for i in range(4,11)] + for tx in possible_tx_list: + if tx > lx: break + ty = 1 + by = ly + possible_dim1_list = [d1 for d1 in range(tx, lx+1, tx)] + if possible_dim1_list[-1] != lx: possible_dim1_list.append(lx) + for d1 in possible_dim1_list: + bx = lx//d1 if lx % d1 == 0 else lx//d1+1 + tiling_spaces.append([1,d1,bx,by,1,tx,ty,1]) + + else: + """reduce-y""" + tx = min(32,lx) + bx = lx//tx if lx %tx==0 else lx//tx + 1 + d0 = tx + for ty in range(min(8,ly),1025): + if ty * tx > 1024: break + possible_dim1_list = [d1 for d1 in range(ty, ly+1, ty)] + for d1 in possible_dim1_list: + by = ly//d1 if ly % d1 == 0 else ly//d1+1 + tiling_spaces.append([d0,d1,bx,by,1,tx,ty,1]) + + input_type = namedtuple(op_type, dim_names) + space = ListConfigSpace(input_type) + if len(tuning_attrs_info[0]) != 0: + for tiling_space in tiling_spaces: + for tuning_attrs_config in tuning_attrs_info[1]: + tmp = tiling_space[:] + tmp.extend(tuning_attrs_config) + config = input_type(*tmp) + space.add(config) + else: + for tiling_space in tiling_spaces: + config = input_type(*tiling_space) + space.add(config) + return space_res.index_table, space, key, expect, input_for_mod \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/job.py b/tests/fuzz/tune_for_gpu/autotuning/job.py new file mode 100644 index 00000000..50c1b446 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/job.py @@ -0,0 +1,501 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoTuning job""" +import os +import json +import time +import datetime +import importlib +import logging +import pandas as pd +import subprocess +import numpy as np +from collections import namedtuple +from multiprocessing import Process, Manager +from akg import composite +from akg.utils import kernel_exec as utils +from akg.composite.build_module import generate_trait +from autotuning.runner import KernelRunner, error_time_list, error_time_string +from autotuning.tuner import ModelBasedTuner, Tuner +from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc +from autotuning.space_generators import get_space +from autotuning.space import ListConfigSpace +from autotuning.test_data_generators import gen_data +from autotuning.space_generators import gen_bool_list +from autotuning.tuning_utils import * + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger('fuzz.tune.autotuning.job') + +storage_dir = './res/' + +if not os.path.exists(storage_dir): + os.makedirs(storage_dir) + +json_file = './res/' + "{0}" + ".json" +json_load = './autotuning/shapes/' + "{0}" + + +def get_repo(repo, keys, default=None): + for key in keys: + repo = repo.get(key) + if not repo: + return default + return repo + + +def get_json_space(json_input, space_dict): + space_res = composite.get_tiling_space(json_input, 2) + space_dict['res'] = space_res + + +def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", 
all_space=False, + skip_exist=True, extra_tune=False, self_attrs=[], tuning_attrs=[]): + """composite json tuning launch""" + subprocess.run("mkdir -p res/", shell=True) + iter_times = [3, 3, 3] if debug_mode else [80, 160, 320] + files = os.listdir(json_dir) + with open(repo_path, 'r') as f: + repo = json.loads(f.read()) + for input_file in files: + print("----Start tuning for ", input_file) + with open(json_dir + '/' + input_file, 'r') as f: + json_input = f.read() + json_content = json.loads(json_input) + for input_desc in json_content["input_desc"]: + if input_desc[0]["shape"] == []: + input_desc[0]["shape"] = [1] + json_input = json.dumps(json_content) + + # skip tuning for info in repo + if skip_exist: + compute, shape, dtype = generate_trait(json_content) + if get_repo(repo, [compute, shape, dtype]): + print("Info for %s already exists" % input_file) + print("ops are ", str(compute)) + print("shape is ", str(shape)) + print("dtype is ", str(dtype)) + with open('res/skip_file.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + + # generate tuning space + if not extra_tune: + time_start_get_space = time.time() + with Manager() as manager: + space_dict = manager.dict() + p = Process(target=get_json_space, + args=(json_input, space_dict)) + p.start() + p.join(600) + if 'res' not in space_dict: + with open('res/error_space_list.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + space_res = space_dict['res'] + time_end_get_space = time.time() + print("get space time: ", time_end_get_space - time_start_get_space) + index_table = space_res['index'] + tiling_spaces = space_res['tuning_space'] + if not isinstance(tiling_spaces, list): + with open('res/empty_space_list.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + dim_names = ['tiling_' + str(i) + for i in range(len(tiling_spaces[0]))] + use_tuning_attrs = len(tiling_spaces) < 10 ** 5 + if tuning_attrs and use_tuning_attrs: + dim_names.extend(tuning_attrs) 
+ input_type = namedtuple("json", dim_names) + space = ListConfigSpace(input_type) + if tuning_attrs and use_tuning_attrs: + attr_options = gen_bool_list(tuning_attrs) + for tiling_space in tiling_spaces: + for attr_option in attr_options: + tmp = tiling_space[:] + tmp.extend(attr_option) + config = input_type(*tmp) + space.add(config) + else: + for tiling_space in tiling_spaces: + config = input_type(*tiling_space) + space.add(config) + else: + index_table = [] + pre_lists = gen_bool_list(self_attrs) + pre_input_type = namedtuple("extra_tune", self_attrs) + space = ListConfigSpace(pre_input_type) + for item in pre_lists: + config = pre_input_type(*item) + space.add(config) + + key = json_content["op"] + try: + input_for_mod, expect = gen_data( + op_type="json", op_desc=json_input) + except BaseException as e: + logger.debug( + "gen numpy data from [%s] failed: %s", input_file, str(e)) + with open('res/error_gen_data_list.txt', 'a') as fe: + fe.write(input_file) + fe.write(": ") + fe.write(str(e)) + fe.write("\n") + continue + print('space size:', space.length) + print('index table:', index_table) + + output_para = None # this is for multi-output + if len(json_content["output_desc"]) > 1: + output_para = [] + for i in range(len(json_content["output_desc"])): + output_para.append(i - len(json_content["output_desc"])) + runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs, + input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180, + repeat_times=1) + + # we can only get a valid tiling, or accurate get cycles + is_truly_profiling = utils.get_profiling_mode( + ) or os.environ['RUNTIME_MODE'] == "gpu" + + # available device numbers, normally is 8 or 1 + available_device_numbers = utils.get_available_devices_num() + + if all_space: + tuner = Tuner(runner, index_table, space, + n_parallel=available_device_numbers) + least_try_times = 3 # space.length + else: + tuner = ModelBasedTuner(runner, 
index_table, space, + n_parallel=available_device_numbers if is_truly_profiling else 1, + plan_size=64, pre_model=None) + least_try_times = iter_times[0 if space.length < + 10 ** 4 else 1 if space.length < 10 ** 5 else 2] + tuner.tune(least_try_times, output_file="json.log") + + print_tuning_result("json", space, index_table, tuner, key) + + if save_res: + if extra_tune: + save_tuning_result(key, "extra_tune", + json_content, index_table, tuner, repo_path) + else: + save_tuning_result(key, "json", json_content, + index_table, tuner, repo_path) + + +def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False, + all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=[], skip_config_set=None, tuning_attrs_info=None): + """AutoTuning jobs""" + iter_times = [3, 3, 3] if debug_mode else [80, 160, 320] + time_start_get_space = time.time() + index_table, space, key, expect, input_for_mod = get_space( + op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info) + time_end_get_space = time.time() + print("get space time: ", time_end_get_space - time_start_get_space) + print('space size:', space.length) + print('index table:', index_table) + key = key if insert_key == '' else insert_key + + # filter already tuned shape + if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim.keys(): + if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]: + return + + if isinstance(conf_of_set_dim[key], dict): + return + + output_para = None # this is for multi-output + if isinstance(input_for_mod, dict): + input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs'] + runner = KernelRunner(op_type, desc, index_table, + self_attrs=None, input_data=input_for_mod, + expect=expect, mod_output_param=output_para, + timeout=30, repeat_times=1, + is_all_space=all_space, + skip_config_set=skip_config_set, + need_tune_json=tuning_attrs_info[2]) + + # we can only get a valid tiling, or 
accurate get cycles + is_truly_profiling = utils.get_profiling_mode() + + # number of multi-processing for build kernels + available_device_numbers = get_parallel_build_num() + + time_start_tuning = time.time() + if all_space: + tuner = Tuner(runner, index_table, space, + n_parallel=available_device_numbers) + least_try_times = space.length + else: + tuner = ModelBasedTuner(runner, index_table, space, + n_parallel=available_device_numbers if is_truly_profiling else 1, + plan_size=100, pre_model=None) + least_try_times = space.length + tuner.tune(least_try_times, output_file=op_type + ".log") + + time_end_tuning = time.time() + print("tuning time: ", time_end_tuning - time_start_tuning) + print_tuning_result(op_type, space, index_table, tuner, key) + # save_results_to_csv(op_type, space, index_table, tuner, key) + + # if save_res: + # save_tuning_result(key, op_type, desc, index_table, tuner) + + +def print_tuning_result(op_type, space, index_table, tuner, key): + """print tuning result""" + print(op_type + " shape is:", key) + print('space size:', space.length) + print('index table:', index_table) + print('best config:', tuner.best_config) + print('best time:', + tuner.best_time if tuner.best_time not in error_time_string.keys() else error_time_string[tuner.best_time]) + print('original time:', tuner.original_time) + print('optimal result is ', tuner.original_time / + tuner.best_time, "faster then auto set dim.") + print("total try times", len(tuner.xs)) + for x, y in zip(tuner.xs, tuner.ys): + print(space.get(x), y if y not in error_time_string.keys() + else error_time_string[y]) + + +def save_results_to_csv(op_type, space, index_table, tuner, key): + """save all results to csv""" + data = [] + for x, y in zip(tuner.xs, tuner.ys): + data.append([space.get(x), y if y not in error_time_string.keys() + else 9999999]) + df = pd.DataFrame(data, columns=["config", "time"]) + df.to_csv(op_type + "_" + key + ".csv") + + +def save_tuning_result(key, op_type, desc, 
index_table, tuner, repo_path="", extra_tune=False, platform="gpu"): + """save tuning result""" + if tuner.best_config is not None and tuner.best_time not in error_time_list: + set_dim_configs = tuner.best_config.input + if op_type == "matmul": + param = [] + for _ in range(len(desc.x_shape) - 2): + param.append((1, 1)) + if set_dim_configs.n_l1 > 0: + param.append((set_dim_configs.n_l1, set_dim_configs.n_l0)) + if set_dim_configs.m_l1 > 0: + param.append((set_dim_configs.m_l1, set_dim_configs.m_l0)) + param.extend( + [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)]) + tiling_param = (param, {"bypass": set_dim_configs.bypass}) + + # special case with different tiling parameter format + elif op_type in ("conv", "conv_bn1"): + param = [] + tile_hh = set_dim_configs.tile_h + tile_coco = set_dim_configs.tile_co + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + tile_ww = set_dim_configs.tile_w + param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + tiling_param = (param, {"bypass": set_dim_configs.bypass}) + elif op_type == "conv_backprop_input": + param = [] + tile_hh = set_dim_configs.tile_h + tile_coco = set_dim_configs.tile_co + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + tile_ww = set_dim_configs.tile_w + param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + tiling_param = (param) + elif op_type == "conv_backprop_filter": + param = [] + tile_cici = set_dim_configs.tile_ci + tile_khkh = set_dim_configs.tile_kh + tile_kwkw = set_dim_configs.tile_kw + tile_coco = set_dim_configs.tile_co + tile_bb = set_dim_configs.tile_batch + tile_hh = set_dim_configs.tile_h + tile_ww = set_dim_configs.tile_w + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, + tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn] + 
tiling_param = (param) + elif ("batch_matmul" in op_type) and (platform == "gpu"): + tiling = [str(getattr(set_dim_configs, name)) for name in getattr( + set_dim_configs, "_fields") if name.startswith("tiling")] + tiling_param = "" + for i, tile_v in enumerate(tiling): + if i % 2 == 0: + tiling_param += "0 " + str(i) + " " + tiling_param += tile_v + " " + + block_param = get_block_str_from_config(set_dim_configs) + thread_param = get_thread_str_from_config(set_dim_configs) + config = { + 'attrs': { + 'dim': tiling_param, + 'bind_block': block_param, + 'bind_thread': thread_param + }, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + 'date': str(datetime.datetime.now()), + 'tuning_time': tuner.tuning_time, + } + elif op_type == "json": + from autotuning.runner import get_attr_from_config + tiling_param = get_attr_from_config(set_dim_configs, index_table) + elif op_type == "reduce_sum_gpu": + print(set_dim_configs) + tiling = [str(getattr(set_dim_configs, name)) + for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')] + tiling_param = "" + for i, tile_v in enumerate(tiling): + tiling_param += "0 " + str(i) + " " + tiling_param += tile_v + " 1 " + + block_param = get_block_str_from_config(set_dim_configs) + thread_param = get_thread_str_from_config(set_dim_configs) + config = { + 'attrs': { + 'dim': tiling_param, + 'bind_block': block_param, + 'bind_thread': thread_param + }, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + 'date': str(datetime.datetime.now()), + 'tuning_time': tuner.tuning_time, + } + else: + print(set_dim_configs) + tiling = [[getattr(set_dim_configs, name), 1] + for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')] + tiling_param = [] + for i, tile_v in enumerate(tiling): + tiling_param.append(index_table[i] + tile_v) + config = [] + else: + tiling_param = [] + + # when there is a valid result, save the result + if op_type in ("json", 
"extra_tune") and tuner.best_time not in error_time_list: + config = {'attrs': tiling_param, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + "date": str(datetime.datetime.now()), + "tuning time": tuner.tuning_time, + } + if op_type == "json": + config["file_name"] = str(key) + compute, shape, dtype = generate_trait(desc) + tuner.export_dim_configs( + config, json_file.format(op_type), False, str(key)) + save_file = "autotuning/extra_tune.json" if extra_tune else repo_path + with open(save_file, 'r') as f: + repo = json.loads(f.read()) + if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or + int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])): + tuner.export_dim_configs_for_keys(config, save_file, False, [ + compute, shape, dtype, "metadata"]) + else: + try: + tuner.export_dim_configs( + config, json_file.format(op_type), False, str(key)) + except UnboundLocalError as e: + logger.warning(e) + print("[save_tuning_result]: ", "no result is saved.") + + +def load_json_configs(op_type): + """load json configs""" + dim_file = json_file.format(op_type) + file_path = os.path.realpath(dim_file) + if os.path.isfile(file_path): + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except IOError as e: + logger.debug(e) + return {} + return {} + + +def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type): + """read tuning shapes from file""" + file = importlib.import_module('autotuning.shapes.' 
+ op_type) + shapes = file.shapes + for _, shp in enumerate(shapes): + do_profiling(shp, debug_mode, save_res, + all_space, op_type, conf_of_set_dim) + + +def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None): + """do profiling""" + # remove undeleted JOB files for previous shapes + subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True) + if op_type == 'matmul': + key = shp[2][0:-1] + logger.debug("start profiling: [%s]", str(key)) + desc = MatmulCubeDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type.startswith('conv_backprop'): + key = shp[2] + logger.debug("start profiling: [%s]", str(key)) + desc = ConvBackpropDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type.startswith('conv') and "gpu" not in op_type: + key = shp[2] + logger.debug("start profiling: [%s]", str(key)) + desc = ConvDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]: + logger.debug("start profiling: [%s]", str(shp)) + jobs(op_type, shp, debug_mode, save_res, + all_space, conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info) + else: + key = shp + logger.debug("start profiling: [%s]", str(key)) + desc = key + jobs(op_type, desc, debug_mode, save_res, + all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set) + logger.debug("end profiling: [%s]", str(key)) + + +def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False, + from_json=False, tuning_attrs=None, skip_config_set=None, 
tuning_attrs_info=None): + # get the existed tiling + conf_of_set_dim = load_json_configs(op_type) if from_json else None + + if desc is None: + read_shapes_from_file(debug_mode, save_res, + all_space, conf_of_set_dim, op_type) + else: + shp = desc + do_profiling(shp, debug_mode, save_res, all_space, op_type, + tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info) diff --git a/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py b/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py new file mode 100644 index 00000000..efb10f6d --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py @@ -0,0 +1,407 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Compile kernel module for operator""" +import os +from typing import NamedTuple +from base import TestBase +from akg.utils import kernel_exec as utils +from akg.utils import custom_tiling as ct_util +from akg.ops.nn import conv_bn1 +from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul +from test_op.batch_matmul import batch_matmul +from akg.ops.math_gpu.reduce_sum import reduce_sum +from akg.build_module import tuning_spaces +from akg.ops.nn import matmul +from test_run import batchmatmul_run, matmul_run +from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig +import numpy as np +from gen_random import random_gaussian +from .tuning_utils import merge_attrs + + +def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None): + # wait for implementation + return + + +def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table, + config: ConvConfig = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param, 'bypass': config.bypass} + + if op_desc.use_bias: + shape = [input_shape[0], input_shape[1], input_shape[2]] + else: + shape = [input_shape[0], input_shape[1]] + conv_dtype = 'float16' + + return utils.op_build(conv.conv, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride, + op_desc.dilation, op_desc.use_bias, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, 
tuning=gen_tiling_spaces) + + +def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_bn1""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_bn1_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param, 'bypass': config.bypass} + + if op_desc.use_bias: + shape = [input_shape[0], input_shape[1], input_shape[2]] + else: + shape = [input_shape[0], input_shape[1]] + conv_dtype = 'float16' + + return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride, + op_desc.dilation, op_desc.use_bias, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table, + config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for matmul_cube""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "matmul_cube_poly" + if idx is not None: + kernel_name += str(idx) + if config is None: + attrs = {'dim': ""} + else: + tiling_param = [] + for _ in range(len(op_desc.x_shape) - 2): + tiling_param.append((1, 1)) + if config.n_l1 > 0: + tiling_param.append((config.n_l1, config.n_l0)) + if config.m_l1 > 0: + tiling_param.append((config.m_l1, config.m_l0)) + tiling_param.extend([(16, 16), (16, 16), (config.k_l1, config.k_l0)]) + dim_info = ct_util.set_dims(tuple(tiling_param)) + attrs = {'dim': dim_info, 'bypass': config.bypass} + return 
matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format, + op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y, + op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name, + attrs, tuning=gen_tiling_spaces) + + +def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_backprop_input""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_backprop_input_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param} + + conv_dtype = 'float16' + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = op_desc.pad + stride_h, stride_w = op_desc.stride + + out_n = in_n + out_c = cout + out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1 + out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1 + + x_shape = (out_n, out_c, out_h, out_w) + w_shape = (cout, in_c, w_h, w_w) + in_nn, in_cc, in_hh, in_ww = x_shape + input_shape_nc1hwc0 = (in_nn, in_cc // block_size, + in_hh, in_ww, block_size) + k_n, k_c, k_h, k_w = w_shape + kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size) + k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0 + kernel_shape_fractal = (k_c // block_size * k_h * + k_w, k_n // block_size, block_size, block_size) + + shape = [input_shape_nc1hwc0, kernel_shape_fractal] 
+ + return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_backprop_filter""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_backprop_filter_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_cici = config.tile_ci + tile_khkh = config.tile_kh + tile_kwkw = config.tile_kw + tile_coco = config.tile_co + tile_bb = config.tile_batch + tile_hh = config.tile_h + tile_ww = config.tile_w + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww, + tile_mm, tile_kk, tile_nn] + attrs = {'conv_tile': tiling_param} + + conv_dtype = 'float16' + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = op_desc.pad + stride_h, stride_w = op_desc.stride + + out_n = in_n + out_c = cout + out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1 + out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1 + + x_shape = (in_n, in_c, in_h, in_w) + y_shape = (out_n, out_c, out_h, out_w) + in_n, in_c, in_h, in_w = x_shape + input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size) + o_n, o_c, o_h, o_w = y_shape + kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size) + o_n, o_c1, o_h, o_w, o_c0 = 
kernel_shape_nc1hwc0 + mo = (o_h * o_w + block_size - 1) // block_size + mi = block_size + kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0) + + input_shape = [kernel_shape_fractal, input_shape_nc1hwc0] + + return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for vector""" + test_base = TestBase() + test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd()) + kernel_name = "poly_" + if idx is not None: + kernel_name += str(idx) + if config is None: + attrs = {'dim': ""} + else: + tiling = [[getattr(config, name), 1] for name in getattr( + config, '_fields') if name.startswith('tiling')] + tiling_param = [] + for i, element in enumerate(tiling): + tiling_param.append(index_table[i] + element) + dim_info = ct_util.set_dims(tuple(tiling_param)) + attrs = {'dim': dim_info} + _, func, args, kwargs = test_base.ana_args(op_desc) + if 'attrs' in kwargs.keys(): + kwargs['attrs']['dim'] = attrs['dim'] + kwargs['attrs']['tuning'] = gen_tiling_spaces + kwargs['attrs']['kernel_name'] = kernel_name + else: + for _, arg_ in enumerate(args): + if isinstance(arg_, dict): + arg_['dim'] = attrs['dim'] + arg_['tuning'] = gen_tiling_spaces + arg_['kernel_name'] = kernel_name + break + try: + if gen_tiling_spaces: + mod, expect, param_for_mod = func(*args, **kwargs) + mod = list(mod) + mod.append(expect) + mod.append(param_for_mod) + else: + mod = func(*args, **kwargs) + except BaseException as e: + print("Compile ERROR message:", e) + print(func) + print("Compile ERROR") + raise Exception("Compile ERROR") + + return mod + + +def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None, + config: 
NamedTuple = None, idx=None, + gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for batch_matmul in gpu""" + kernel_name = "batch_matmul_gpu_" + # wait for implementation + return + + +def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None, + config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for reduce_sum in gpu""" + kernel_name = "reduce_sum_gpu_" + if idx is not None: + kernel_name += str(idx) + attrs = op_desc[2] + if config is not None: + attrs = merge_attrs(attrs, config, need_tune_json) + + try: + if gen_tiling_spaces: + # NOTE: don't use this process for reduce spaces generation, + # see function: "_get_space_reduce_gpu_manually". + from .tiling_strategies_gpu import reduce_gpu_tiling_strategy + spaces, set_dim_key = utils.op_build(reduce_sum, (attrs.in_shape, ), + (attrs.in_dtype, + ), kernel_name="reduce_sum", + op_attrs=[ + attrs.axis, attrs.keepdims], + attrs={"target": "cuda", + "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib, + "enable_atomic_add": attrs.enable_atomic_add, + "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)}, tuning=True) + + from test_ms_reduce_sum import gen_data + input_for_mod, output, expect = gen_data( + attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims) + return [spaces, set_dim_key, expect, [input_for_mod, output]] + else: + mod = utils.op_build(reduce_sum, (attrs.in_shape, ), + (attrs.in_dtype, + ), kernel_name="reduce_sum", + op_attrs=[ + attrs.axis, attrs.keepdims], + attrs={"target": "cuda", + "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib, + "dim": attrs.dim, + "bind_block": attrs.bind_block, + "bind_thread": attrs.bind_thread, + "enable_atomic_add": attrs.enable_atomic_add}) + return mod + except BaseException as e: + print("Compile ERROR message:", e) + print(reduce_sum) + print("Compile ERROR") + raise Exception("Compile ERROR") + + +def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, 
index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for convolution in gpu using image2col+gemm""" + # wait for implementation + return + + +_compile_kernel_func = { + 'conv': gen_kernel_conv, + 'conv_bn1': gen_kernel_conv_bn1, + 'conv_backprop_input': gen_kernel_conv_backprop_input, + 'conv_backprop_filter': gen_kernel_conv_backprop_filter, + 'matmul': gen_kernel_matmul_cube, + 'reduce_sum_gpu': gen_kernel_reduce_sum_gpu, + 'batch_matmul_gpu': gen_kernel_batch_matmul_gpu, + 'conv_image2col_gemm_gpu': gen_kernel_conv_image2col_gemm_gpu, +} + + +def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None, + config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None): + """Generate kernel module for operator + + Parameters + op_type: str + operator name + op_desc: NamedTuple + operator definition parameters + config_param: NameTuple + operator config parameters + idx: int + operator idx(th) kernel + gen_tiling_spaces: bool + parameter passed to utils.op_build, whether to get spaces instead of stmt + ---------- + + Returns: + kernel if gen_tiling_spaces == False else np.ndarray + """ + gen_func = _compile_kernel_func.get(op_type, None) + if gen_func is None: + gen_func = gen_kernel_for_vector + if gen_tiling_spaces: + space_res, key, expect, input_for_mod = gen_func(op_desc, input_shape, index_table, config_param, + idx, gen_tiling_spaces) + else: + if "gpu" in op_type: + mod = gen_func(op_desc, input_shape, index_table, + config_param, idx, gen_tiling_spaces, need_tune_json=need_tune_json) + else: + mod = gen_func(op_desc, input_shape, index_table, + config_param, idx, gen_tiling_spaces) + + return [space_res, key, expect, input_for_mod] if gen_tiling_spaces else mod diff --git a/tests/fuzz/tune_for_gpu/autotuning/runner.py b/tests/fuzz/tune_for_gpu/autotuning/runner.py new file mode 100644 index 00000000..a3400932 --- 
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runner for compile and execute a configs of an operator on device"""
import time
import multiprocessing
import logging
import json
import os
import subprocess
from typing import NamedTuple
import numpy as np
from akg import composite
from akg.utils import custom_tiling as ct_util
from akg.utils import kernel_exec as utils
from .kernel_compiler import compile_kernel
from .test_data_generators import gen_data
# Star import kept for compatibility; it provides get_available_gpu_num().
from .tuning_utils import *

logger = logging.getLogger('fuzz.tune.autotuning.runner')

# Sentinel "run times" encoding the different failure modes.
run_failed_time = 9999999999.0
precision_error_time = 9999999998.0
compile_fail_time = 9999999997.0
timeout_time = 9999999996.0

error_time_list = [
    run_failed_time,
    precision_error_time,
    compile_fail_time,
    timeout_time,
]

error_time_string = {
    run_failed_time: 'run_failed',
    precision_error_time: 'precision_error',
    compile_fail_time: 'compile_failed',
    timeout_time: 'timeout',
}


def get_attr_from_config(config, index_table):
    """Split a config namedtuple into a tiling 'dim' string plus plain attrs.

    Fields named tiling* are turned into [value, 1] pairs, offset by
    index_table, and folded into attrs['dim']; all other fields are copied
    through unchanged.
    """
    tiling = []
    attrs = {}
    for key, value in config._asdict().items():
        if key.startswith('tiling'):
            tiling.append([value, 1])
        else:
            attrs[key] = value
    if tiling:
        tiling_param = [index_table[i] + element for i, element in enumerate(tiling)]
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs


class KernelRunner:
    """kernel runner
    This runner will compile and execute configs of an operator, and return their running times.

    Parameters
    ----------
    op_type: str
        The name of operator
    op_desc: NamedTuple
        The definition parameters of operator
    timeout: int
        Timeout for running one config
    repeat_times:
        Run one config repeat_times
    """

    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        self.run_kernel_time = 0.0
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        if input_data is None:
            self.input, self.expect = gen_data(op_type, op_desc)
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        self.input_shape = [x.shape for x in self.input]

    def info(self):
        print('run kernel time:', self.run_kernel_time)

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute one config of the operator on device.

        Writes the result (a runtime, or an error sentinel) into
        run_times[idx]; best_time is kept for signature compatibility.
        """
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return

        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # Round-robin the available GPUs across worker processes.
        gpu_devices_list = get_available_gpu_num()
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel on device %s', idx, device_id)
        try:
            time_start_build = time.time()
            logger.debug(config)
            if self.op_type in ("json", "extra_tune"):
                # NOTE(review): the original nesting here was ambiguous in
                # transit; this is the only coherent reading (auto build vs.
                # explicit-attrs build) — confirm against upstream.
                if is_auto:
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(config.input, self._index_table)
                    if os.environ.get('RUNTIME_MODE') == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx,
                                     need_tune_json=self.need_tune_json)
            logger.debug("build module time: %f", time.time() - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s",
                         "origin" if is_auto else str(config.input), str(e))
            run_times[idx] = compile_fail_time
            return

        run_times[idx] = run_failed_time
        try:
            # NOTE: in gpu tuning there is no need to loop for repeat_times;
            # repetition is handled inside mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    if self.mod_output_param is None:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03,
                                           equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))
                    logger.debug("mod launch time: %f", time.time() - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                # .get guards the branch where no launch happened (no KeyError).
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info.get('run_time', run_failed_time))
        finally:
            logger.debug('end of %dth kernel', idx)
            logger.debug('run one kernel time: %f', time.time() - time_one_kernel_start)

    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch of configs; returns the list of run times."""
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)

        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        process_jobs = []
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()

        # After the first timeout, the remaining workers are reaped without
        # waiting another full timeout each.
        timeout_error = False
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()

        logger.debug("process time: %f", time.time() - start)
        # Clean the meta directories produced by GPU builds.  (The unused
        # DEVICE_ID/DEVICE_TOTAL_NUM env reads were removed: they could raise
        # KeyError and their values were never used.)
        if os.environ.get('RUNTIME_MODE') == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)

        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start

        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))

        return run_times


# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +"""Config space""" +from abc import ABCMeta, abstractmethod +from typing import NamedTuple, List +import random +import numpy as np + + +class ConfigEntity: + """General config entity""" + + def __init__(self, input_id: int, input_space: NamedTuple): + self.__input = input_space + self.__input_id = input_id + self.__input_type = type(input_space) + + def __len__(self): + return len(self.__input) + + def __str__(self): + return str(self.__input_id) + ': ' + str(self.__input) + + def __repr__(self): + return str(self) + + @property + def input_id(self): + return self.__input_id + + @property + def input_type(self): + return self.__input_type + + @property + def input(self): + return self.__input + + @property + def feature(self): + return self.__input + + +class ConfigSpace(metaclass=ABCMeta): + """Searching space of configs""" + + def __init__(self, input_type): + self._input_type = input_type + self._dim_names = getattr(self._input_type, '_fields') + + self._configs = [] # List[ConfigEntity] + + @abstractmethod + def reset_fetch(self): + pass + + @abstractmethod + def has_next(self) -> bool: + pass + + @abstractmethod + def fetch_index(self) -> int: + """fetch a random index of config""" + + @abstractmethod + def fetch_config(self) -> ConfigEntity: + """fetch a random config""" + + @abstractmethod + def random_walk(self, p: int) -> int: + """find a neighbor hood of the p-th ConfigEntity, which only + differs with p in at most one dimension""" + + def get(self, idx: int) -> ConfigEntity: + """get the `idx`-th config of the space""" + return self._configs[idx] + + @property + def configs(self): + return self._configs + + @property + def dim_names(self): + return self._dim_names + + @property + def input_type(self): + return self._input_type + + @property + # @abstractmethod + def length(self): + return len(self.configs) + + +class ConfigTrie: + """Trie node for config entities""" + + def __init__(self): + self.ch = dict() + + def add(self, config: ConfigEntity, 
last_dim: int): + """add a ConfigEntity""" + cur = self + for i, x in enumerate(config.input): + if i == last_dim: + continue + if x not in cur.ch: + cur.ch[x] = ConfigTrie() + if not isinstance(cur.ch, dict): + raise TypeError('none-leaf node should have a dict of childs') + cur = cur.ch[x] + + if not isinstance(cur.ch, list): + cur.ch = [] + cur.ch.append(config.input_id) + + def fetch_random(self, config: ConfigEntity, last_dim: int) -> int: + """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension""" + cur = self + for i, x in enumerate(config.input): + if i == last_dim: + continue + if not isinstance(cur.ch, dict): + raise TypeError('none leaf node should have a dict of childs') + if x not in cur.ch: + raise RuntimeError('no element found') + cur = cur.ch[x] + if not cur.ch: + raise RuntimeError('no element found') + if len(cur.ch) == 1: + return cur.ch[0] + idx = config.input_id + while idx == config.input_id: + idx = random.choice(cur.ch) + return idx + + +class ListConfigSpace(ConfigSpace): + """Searching space of configs, which stores all possible configs in a list""" + + def __init__(self, input_type): + super(ListConfigSpace, self).__init__(input_type) + + self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))] + self.__fetch_pool = [] + + def reset_fetch(self): + """reset fetch state""" + self.__fetch_pool = [i for i in range(len(self._configs))] + + def fetch_scope(self, start, end): + self.__fetch_pool = [i for i in range(start, end)] + + def has_next(self) -> bool: + return len(self.__fetch_pool) > 0 + + def fetch_index(self) -> int: + """fetch a random index of config""" + idx = np.random.randint(len(self.__fetch_pool)) + ret = self.__fetch_pool[idx] + self.__fetch_pool[idx] = self.__fetch_pool[-1] + self.__fetch_pool.pop() + return ret + + def fetch_next_index(self) -> int: + """fetch next index of config""" + idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0] + 
self.__fetch_pool.pop() + return idx + + def fetch_config(self) -> ConfigEntity: + """fetch a random config""" + return self.get(self.fetch_index()) + + def add(self, input_space: NamedTuple): + """add a new config to space""" + if not isinstance(input_space, self._input_type): + raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space), + self._input_type)) + config = ConfigEntity(len(self._configs), input_space) + self.__fetch_pool.append(len(self._configs)) + for i in range(len(self._dim_names)): + self.__config_tries[i].add(config, i) + self._configs.append(config) + + def random_walk(self, p: int) -> int: + """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension""" + dim = np.random.randint(len(self._dim_names)) + return self.__config_tries[dim].fetch_random(self._configs[p], dim) + + @property + def length(self): + return len(self._configs) + + @classmethod + def from_list(cls, configs: List[NamedTuple]): + if not isinstance(configs, list): + raise TypeError('configs must be of list type, got %s' % type(configs)) + if not configs: + raise ValueError('configs must be non-empty') + space = cls(type(configs[0])) + for config in configs: + space.add(config) + return space diff --git a/tests/fuzz/tune_for_gpu/autotuning/space_generators.py b/tests/fuzz/tune_for_gpu/autotuning/space_generators.py new file mode 100644 index 00000000..ba6c6ae3 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/space_generators.py @@ -0,0 +1,753 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""space generating functions for operators"""
from functools import partial
from typing import NamedTuple
from collections import namedtuple
from enum import Enum
from itertools import product
from test_run import matmul_run
from akg.utils import validation_check as vc_util
from tqdm import tqdm
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, \
    ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
from .space import ListConfigSpace
from .kernel_compiler import compile_kernel
from .gen_spaces_gpu import _get_space_reduce_gpu_manually

GPU_IDX_TO_STR = {0: "x", 1: "y", 2: "z"}


class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread."""
    FULL = "FULL"
    BMM = "BMM"
    REDUCE_ALL = "REDUCE_ALL"
    REDUCE_X = "REDUCE_X"
    REDUCE_Y = "REDUCE_Y"


def gen_bool_list(attr_list):
    """Return every True/False combination of length len(attr_list).

    Equivalent to the cartesian product {True, False}^n; an empty attr_list
    yields an empty list (kept for backward compatibility with the original
    hand-rolled expansion).
    """
    if not attr_list:
        return []
    return [list(combo) for combo in product([True, False], repeat=len(attr_list))]


def _get_space_vector(op_type: str, op_desc):
    """get config space of vector operator"""
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    if space_res is None:
        raise RuntimeError('no space returned')
    if 'index' not in space_res or 'tuning_space' not in space_res:
        raise RuntimeError('invalid space returned')
    index_table = space_res['index']
    tiling_spaces = space_res['tuning_space']
    if not tiling_spaces:
        raise RuntimeError('empty tiling spaces')

    # One namedtuple field per tiling axis reported by the compiler.
    dim_names = ['tiling_' + str(i) for i in range(len(tiling_spaces[0]))]
    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    for tiling_space in tiling_spaces:
        space.add(input_type(*tiling_space))
    return index_table, space, key, expect, input_for_mod


def _conv_tiling_space(op_desc: ConvDesc, l0c_divisor: int = 1):
    """Enumerate the tiling space shared by conv and conv_bn1.

    The two operators previously duplicated ~100 lines; they differ only in
    the L0C budget (conv_bn1 keeps four result tensors, so it divides the
    budget by 4).  Returns the usual 5-tuple
    (index_table, space, key, expect, input_for_mod).
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buffering is not enabled, keep this scale factor at 1
    size_scale = 1
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // l0c_divisor

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    p_top, p_bottom, p_left, p_right = pad_[0], pad_[1], pad_[2], pad_[3]
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per fp16 element

    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for bypass in (0, 1):
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w + p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                for tile_co in co_range(k_n_, tile_co_start) if False else range(k_n_, tile_co_start - 1, -16):
                    if bypass == 1:
                        # bypass mode requires the full co dimension in one tile.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c
                                              + tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue

                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max_ = ((in_c * k_h * k_w - 1) // 16 + 1) * 16
                        k_size_ = l0b_max_size // data_len // tile_n // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (((tile_h - k_h) // s_h) + 1) * (((tile_w - k_w) // s_w) + 1)
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1_ = l0a_max_size // data_len // tile_k // 16 * 16
                            m_size2_ = l0c_max_size // data_len // tile_n // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution"""
    return _conv_tiling_space(op_desc)


def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of fused convolution + bn1 (L0C budget divided by 4)"""
    return _conv_tiling_space(op_desc, l0c_divisor=4)
\ + ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1) + + s_h = 1 + s_w = 1 + + tile_c = out_c + tile_co_start = 16 + + data_len = 2 + + h_max = out_h + p_top + p_bottom + win_h = (h_max - k_h) // s_h + 1 + h_max = (h_max - k_h) // s_h * s_h + k_h + w_max = out_w + p_left + p_right + win_w = (w_max - k_w) // s_w + 1 + w_max = (w_max - k_w) // s_w * s_w + k_w + + for tile_h in range(h_max, k_h - 1, -s_h): + size_h = tile_h + if tile_h == h_max: + w_range = range(w_max, k_w - 1, -s_w) + size_h = in_h + else: + w_range = [w_max] + win_tile_h = (tile_h - k_h) // s_h + 1 + h_tiles = (win_h + win_tile_h - 1) // win_tile_h + if h_tiles == 2: + size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h) + + for tile_w in w_range: + size_w = tile_w + if size_w == w_max: + size_w = in_w + else: + win_tile_w = (tile_w - k_w) // s_w + 1 + w_tiles = (win_w + win_tile_w - 1) // win_tile_w + if w_tiles == 2: + size_w = max(tile_w - p_left, in_w + + p_left - tile_w + k_w - s_w) + + k_n_ = ((k_n - 1) // 16 + 1) * 16 + co_range = range(k_n_, tile_co_start - 1, -16) + for tile_co in co_range: + l1_size = data_len * (size_h * size_w * out_c + + tile_co * tile_c * k_h * k_w) + if l1_size > l1_max_size: + continue + ub_size = data_len * (size_h * size_w * out_c) + if ub_size > ub_max_size: + continue + + tile_co_ = ((tile_co - 1) // 16 + 1) * 16 + for tile_n in range(tile_co_, 15, -16): + k_max = out_c * k_h * k_w + k_base = 16 * k_h * k_w + k_max_ = ((k_max - 1) // k_base + 1) * k_base + k_size = l0b_max_size // data_len // tile_n + k_size_ = k_size // k_base * k_base + for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base): + m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \ + (int(((tile_w - k_w) // (s_w)) + 1)) + m_max_ = ((m_max - 1) // 16 + 1) * 16 + m_size1 = l0a_max_size // data_len // tile_k + m_size1_ = m_size1 // 16 * 16 + m_size2 = l0c_max_size // data_len // tile_n + m_size2_ = m_size2 // 16 * 16 + for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16): 
+ config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m, + tile_k, tile_n, tile_w)) + return None, config_space, op_desc.__str__(), None, None + + +def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc): + """get config space of convolution backwprop filter""" + if not isinstance(op_desc, ConvBackpropDesc): + raise TypeError('op_desc must be ConvBackpropDesc') + + stride_ = op_desc.stride + pad_ = op_desc.pad + dilation_ = op_desc.dilation + vc_util.convolution_format_check( + op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_) + config_space = ListConfigSpace(ConvBackpropFilterConfig) + + # if double buff is not enabled, set it's value to 1 + size_scale = 1 + block_size = 16 + + l1_max_size = (1024 * 1024) // size_scale + l0a_max_size = (64 * 1024) // size_scale + l0b_max_size = (64 * 1024) // size_scale + l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, k_h, k_w = op_desc.filter_shape + k_n = cout + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = pad_ + s_h, s_w = stride_ + tile_co_start = 16 + tile_ci_start = 16 + data_len = 2 + h_max = in_h + pad_top + pad_bottom + win_h = (h_max - k_h) // s_h + 1 + h_max = (h_max - k_h) // s_h * s_h + k_h + w_max = in_w + pad_left + pad_right + win_w = (w_max - k_w) // s_w + 1 + w_max = (w_max - k_w) // s_w * s_w + k_w + + for tile_h in range(h_max, k_h - 1, -s_h): + size_h = tile_h + win_tile_h = (tile_h - k_h) // s_h + 1 + # Only one head for cut H axis + if win_tile_h * s_h < pad_top: + continue + # Only one tail for cut H axis + if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top: + continue + if tile_h == h_max: + w_range = range(w_max, k_w - 1, -s_w) + size_h = in_h + else: + w_range = [w_max] + h_tiles = (win_h + win_tile_h - 1) // win_tile_h + if h_tiles == 2: + 
size_h = max(tile_h - pad_top, in_h + + pad_top - tile_h + k_h - s_h) + + for tile_w in w_range: + size_w = tile_w + win_tile_w = (tile_w - k_w) // s_w + 1 + # Only one head for cut W axis + if win_tile_w * s_w < pad_left: + continue + # Only one tail for cut W axis + if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left: + continue + if size_w == w_max: + size_w = in_w + else: + w_tiles = (win_w + win_tile_w - 1) // win_tile_w + if w_tiles == 2: + size_w = max(tile_w - pad_left, in_w + + pad_left - tile_w + k_w - s_w) + for tile_kh in range(k_h, 0, -1): + for tile_kw in range(k_w, 0, -1): + k_n_ = ((k_n - 1) // 16 + 1) * 16 + co_range = range(k_n_, tile_co_start - 1, -16) + for tile_co in co_range: + in_c_ = ((in_c - 1) // 16 + 1) * 16 + ci_range = range(in_c_, tile_ci_start - 1, -16) + for tile_ci in ci_range: + tile_batch = 1 + l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w + + tile_ci * size_h * size_w) + if l1_size > l1_max_size: + continue + + if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_): + tile_m = tile_co + tile_n = tile_ci * tile_kh * tile_kw + l0c_size = data_len * tile_n * tile_m + if l0c_size > l0c_max_size: + continue + k_max = tile_batch * tile_h * tile_w + k_max_ = ((k_max - 1) // 16 + 1) * 16 + k_size1 = l0a_max_size // data_len // tile_m + k_size1_ = k_size1 // 16 * 16 + k_size2 = l0b_max_size // data_len // tile_n + k_size2_ = k_size2 // 16 * 16 + for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16): + config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co, + tile_batch, tile_h, tile_w, tile_m, + tile_k, tile_n)) + else: + for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16): + k_max = tile_batch * tile_h * tile_w + k_max_ = ((k_max - 1) // 16 + 1) * 16 + k_size = l0b_max_size // data_len // tile_n + k_size_ = k_size // 16 * 16 + for tile_k in range(min(k_max_, k_size_), 15, -16): + m_max = tile_co + m_max_ = ((m_max - 1) // 16 + 
1) * 16 + m_size1 = l0a_max_size // data_len // tile_k + m_size1_ = m_size1 // 16 * 16 + m_size2 = l0c_max_size // data_len // tile_n + m_size2_ = m_size2 // 16 * 16 + for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16): + config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, + tile_co, tile_batch, tile_h, + tile_w, tile_m, tile_k, tile_n)) + return None, config_space, op_desc.__str__(), None, None + + +def _get_space_matmul_cube(op_desc: MatmulCubeDesc): + """get config space of matmul_cube""" + if not isinstance(op_desc, MatmulCubeDesc): + raise TypeError('op_desc must be MatmulCubeDesc') + config_space = ListConfigSpace(MatmulCubeConfig) + batch_tuple, m, k, n = matmul_run.extract_dim( + op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y) + + mmax = (m + 15) // 16 + nmax = (n + 15) // 16 + kmax = (k + 15) // 16 + + double_buffer = True + mad_fp32 = True + + l1_max_size = (1024 * 1024) # L1 MEM 1024KB + l0a_max_size = (64 * 1024) # L0A MEM 64KB + l0b_max_size = (64 * 1024) # L0B MEM 64KB + l0c_max_size = (256 * 1024) # L0C MEM 256KB + # UB MEM 248KB, 8KB reserved for compiler + ub_max_size = ((256 - 8) * 1024) + + if double_buffer: + l1_max_size = l1_max_size // 2 + l0a_max_size = l0a_max_size // 2 + l0b_max_size = l0b_max_size // 2 + l0c_max_size = l0c_max_size // 2 + ub_max_size = ub_max_size // 2 + + if mad_fp32: + l0c_max_size = l0c_max_size // 2 + if op_desc.out_dtype == 'float32': + ub_max_size = ub_max_size // 2 + + bypass_options = [0, 1, 2] + + for bypass in bypass_options: + if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or + (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')): + continue + + if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or + (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')): + continue + + for k_l1 in range(1, kmax + 1): + if kmax % k_l1 != 0: + continue + for k_l0 in range(1, k_l1 
+ 1): + if k_l1 % k_l0 != 0: + continue + + # no need to cut from l1 to l0 for m and n when k is cut + for m_l1 in range(1, mmax + 1): + if mmax % m_l1 != 0: + continue + m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1) + for m_l0 in m_l0_range: + if m_l1 % m_l0 != 0: + continue + for n_l1 in range(1, nmax + 1): + if nmax % n_l1 != 0: + continue + n_l0_range = [n_l1] if k_l1 != kmax else range( + 1, n_l1 + 1) + for n_l0 in n_l0_range: + if n_l1 % n_l0 != 0: + continue + + if m_l0 * 16 * k_l0 * 16 > l0a_max_size: + continue + + if n_l0 * 16 * k_l0 * 16 > l0b_max_size: + continue + + if m_l0 * 16 * n_l0 * 16 > l0c_max_size: + continue + + if m_l0 * 16 * n_l0 * 16 > ub_max_size: + continue + + if bypass == 2: + l1_size = n_l1 * 16 * k_l1 * 16 + elif bypass == 1: + l1_size = m_l1 * 16 * k_l1 * 16 + else: + l1_size = (m_l1 * 16 + n_l1 * + 16) * k_l1 * 16 + if l1_size > l1_max_size: + continue + + if nmax == 1: + n_l1 = 0 + n_l0 = 0 + if mmax == 1: + m_l1 = 0 + m_l0 = 0 + if kmax == 1: + k_l1 = 16 + k_l0 = 16 + config_space.add(MatmulCubeConfig( + n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass)) + shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, + op_desc.bias, op_desc.left_format, + op_desc.right_format, op_desc.out_format) + return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format, + op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype, + op_desc.out_dtype)), None, None + + + +def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of batch_matmul operator in gpu""" + return + +def get_range_block(space_res): + block_range = space_res.gpu_block_range_table.asnumpy().tolist() + block_mod = space_res.gpu_block_mod_table.asnumpy().tolist() + block_x_range = range(block_range[0][0], block_range[0][1]+1, block_mod[0][0]) + block_y_range = range(block_range[1][0], 
block_range[1][1]+1, block_mod[1][0]) + if len(block_y_range) == 0: block_y_range = range(1,2) + block_z_range = range(block_range[2][0], block_range[2][1]+1, block_mod[2][0]) + if len(block_z_range) == 0: block_z_range = range(1,2) + return block_x_range,block_y_range,block_z_range + +def get_range_thread(space_res): + thread_range = space_res.gpu_thread_range_table.asnumpy().tolist() + thread_mod = space_res.gpu_thread_mod_table.asnumpy().tolist() + thread_x_range = range(thread_range[0][0], thread_range[0][1]+1, thread_mod[0][0]) + thread_y_range = range(thread_range[1][0], thread_range[1][1]+1, thread_mod[1][0]) + if len(thread_y_range) == 0: thread_y_range = range(1,2) + thread_z_range = range(thread_range[2][0], thread_range[2][1]+1, thread_mod[2][0]) + if len(thread_z_range) == 0: thread_z_range = range(1,2) + return thread_x_range,thread_y_range,thread_z_range + +def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL): + total_shape = max([max(v) for v in tiling_spaces]) + new_spaces = [] + block_x_range, block_y_range, block_z_range = get_range_block(space_res) + thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res) + pbar = tqdm(total=len(tiling_spaces)) + max_thread = 1024 + for space in tiling_spaces: + pbar.set_description("Adding block, thread to spaces") + if policy == GpuSpacePolicy.REDUCE_ALL: + for bx in range((total_shape-1)//space[0]+1,(total_shape-1)//space[0]+2): + for by in block_y_range: + for bz in block_z_range: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + if tx * ty * tz > max_thread: + continue + tmp_space = space[:] + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + elif policy == GpuSpacePolicy.BMM: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + if tx * ty * tz > max_thread: + continue + 
tmp_space = space[:] + if tx > tmp_space[-1] or (len(tmp_space) >= 2 and ty > tmp_space[-2]) or (len(tmp_space) >= 3 and tz > tmp_space[-3]): + continue + bx = max(1, tmp_space[-1] // tx) + by = max(1, tmp_space[-2] // ty) if len(tmp_space) >= 2 else 1 + bz = max(1, tmp_space[-3] // tz) if len(tmp_space) >= 3 else 1 + if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop: + continue + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + elif policy == GpuSpacePolicy.FULL: + for bx in block_x_range: + for by in block_y_range: + for bz in block_z_range: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + tmp_space = space[:] + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + else: + raise ValueError("Policy {} is not defined.".format(policy)) + + pbar.update(1) + print("total spaces size is: ",len(new_spaces)) + return new_spaces + +def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of conv_image2col_gemm operators in gpu""" + return + +_get_space_func = { + 'conv': _get_space_conv, + 'conv_bn1': _get_space_conv_bn1, + 'conv_backprop_input': _get_space_conv_backprop_input, + 'conv_backprop_filter': _get_space_conv_backprop_filter, + 'matmul': _get_space_matmul_cube, + "reduce_sum_gpu": _get_space_reduce_gpu_manually, + "batch_matmul_gpu": _get_space_batch_matmul_gpu, + "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu, +} + + +def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=[], tuning_attrs_info=None): + """get space of an operator""" + func = _get_space_func.get(op_type, None) + if func is None: + func = partial(_get_space_vector, op_type=op_type) + if "gpu" in op_type: + 
return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info) + return func(op_desc=op_desc) diff --git a/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py b/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py new file mode 100644 index 00000000..f8ffed41 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py @@ -0,0 +1,147 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Generating test data for operators""" +from typing import NamedTuple + +import numpy as np +from gen_json_data import gen_json_data +from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run +from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc + +def _gen_data_json(op_desc): + """Generating test data for composite json""" + input_for_mod, expect, _ = gen_json_data(op_desc) + return input_for_mod, expect + +def _gen_data_conv(op_desc: ConvDesc): + """Generating test data for conv""" + fmap_data, filter_data, bias_data, expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, + op_desc.pad, op_desc.stride, op_desc.dilation, + op_desc.use_bias) + out_data = np.full(expect.shape, 0, 'float16') + + if op_desc.use_bias: + args = (fmap_data, filter_data, bias_data, out_data) + else: + args = (fmap_data, filter_data, out_data) + return args, expect + + +def _gen_data_conv_bn1(op_desc: ConvDesc): + """Generating test data for conv_bn1""" + fmap_data, filter_data, bias_data, conv_expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, + op_desc.pad, op_desc.stride, op_desc.dilation, + op_desc.use_bias) + axes = (0, 2, 3) + conv_mean = np.mean(conv_expect, axis=axes, keepdims=True) + conv_square = np.power(conv_expect, 2) + conv_var_part = np.mean(conv_square, axis=axes, keepdims=True) + + expects = (conv_expect, conv_var_part, conv_mean) + + out_datas = [np.full(e.shape, 0, 'float16') for e in expects] + out_datas[1] = out_datas[1].astype(np.float32) + out_datas[2] = out_datas[2].astype(np.float32) + + if op_desc.use_bias: + in_data = [fmap_data, filter_data, bias_data] + else: + in_data = [fmap_data, filter_data] + + args = in_data + for out in out_datas: + args.append(out) + args = tuple(args) + + return {"args": args, 'outputs': (-3, -2, -1)}, expects + + +def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc): + dout, w, dx = 
conv_backprop_input_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation) + out_data = np.full(dx.shape, 0, 'float16') + + args = (dout, w, out_data) + return args, dx + + +def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc): + """Generating test data for conv_backprop_filter""" + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + x_shape = (in_n, in_c, in_h, in_w) + w_shape = (cout, in_c, w_h, w_w) + + dy_data, dx_data, expect = conv_backprop_filter_run.gen_data(x_shape, w_shape, op_desc.pad, op_desc.stride, + op_desc.dilation) + out_data = np.full(expect.shape, 0, 'float32') + + args = (dy_data, dx_data, out_data) + return args, expect + + +def _gen_data_matmul_cube(op_desc: MatmulCubeDesc): + """Generating test data for matmul_cube""" + batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y) + m = (m + 15) // 16 * 16 + n = (n + 15) // 16 * 16 + k = (k + 15) // 16 * 16 + _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, + op_desc.bias, op_desc.left_format, op_desc.right_format, + op_desc.out_format) + m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype, + op_desc.out_dtype, op_desc.bias, op_desc.adj_x, + op_desc.adj_y, op_desc.left_format, + op_desc.right_format, op_desc.out_format) + + out_data = np.full(out_shape, np.nan, op_desc.out_dtype) + + if op_desc.bias: + args = (m_x, m_y, bias_data, out_data) + else: + args = (m_x, m_y, out_data) + return args, bench_mark + + +_gen_data_func = { + 'json': _gen_data_json, + 'conv': _gen_data_conv, + 'conv_bn1': _gen_data_conv_bn1, + 'conv_backprop_input': _gen_data_conv_backprop_input, + 'conv_backprop_filter': 
_gen_data_conv_backprop_filter, + 'matmul': _gen_data_matmul_cube, +} + + +def gen_data(op_type: str, op_desc: NamedTuple): + """Generate test data for operator + + Parameters + op_type: str + operator name + op_desc: NamedTuple + operator definition parameters + ---------- + """ + gen_func = _gen_data_func.get(op_type, None) + if gen_func is None: + raise ValueError('Unsupported op type for test data generating: %s' % op_type) + return gen_func(op_desc) diff --git a/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py b/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py new file mode 100644 index 00000000..f8af6cc0 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py @@ -0,0 +1,84 @@ +from akg.utils import custom_tiling as ct_util + +def reduce_gpu_tiling_strategy(in_shape, reduce_axis): + """Custom tiling strategy for reduce op in gpu""" + strategy = list() + + if reduce_axis == None or len(reduce_axis) == len(in_shape): + """all-reduce""" + strategy.append( + ct_util.create_constraint_on_axis( + values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MOD + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1024, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MIN + ) + ) + elif (len(in_shape) - 1) in reduce_axis: + """Reduce-X: dummy strategy for hand-write space""" + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], 
constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX + ) + ) + + else: + """Reduce-Y: dummy strategy for hand-write space""" + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX + ) + ) + + return strategy + + +def conv_dummy_strategy(): + """Conv strategy: dummy strategy""" + return + +def batch_matmul_gpu_tiling_strategy(desc): + """Custom tiling strategy for batch matmul in gpu with or without tensor core""" + return \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuner.py b/tests/fuzz/tune_for_gpu/autotuning/tuner.py new file mode 100644 index 00000000..98e35c25 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuner.py @@ -0,0 +1,359 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tuner for finding best config for operators""" +import logging +import time +import json +import os +import numpy as np +from multiprocessing import Process +from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel +from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer +from .space import ConfigSpace +from .runner import KernelRunner +from tqdm import tqdm + +logger = logging.getLogger('fuzz.tune.autotuning.tuner') + + +class Tuner: + """Basic tuner class + + Parameters + ---------- + runner: KernelRunner + This is for run kernels in physical device + config_space: ConfigSpace + The space of configs + n_parallel: int + How many kernels are processed in a turn + """ + + def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None): + self._runner = runner + self._index_table = index_table + self._space = config_space + self._n_parallel = n_parallel + + # trial plan + self._trials = [] + self._trial_pt = 0 + self._visited = set() + + # observed samples + self._xs = [] + self._ys = [] + + # keep the current best + self._best_config = None # type: ConfigEntity + self._best_time = np.inf + self._best_iter = 0 + self._tuning_time = 0.0 + self._original_time = np.inf + self._skip_config_set = skip_config_set + + @property + def best_config(self): + return self._best_config + + @property + def best_time(self): + return self._best_time + + @property + def best_iter(self): + return self._best_iter + + @property + def tuning_time(self): + return self._tuning_time + + @property + def original_time(self): + return self._original_time + + @property + def xs(self): + return self._xs + + @property + def ys(self): + return self._ys + + def info(self): + print('space size:', self._space.length) + print('best config:', self._best_config) + print('best time:', self._best_time) + print('best_iter:', self._best_iter) + print('tuning time:', self._tuning_time, 'secs') + + def 
next_batch(self, batch_size: int, is_add_visited=True): + """extract next batch with xgboost model""" + ret = [] + counter = 0 + if not is_add_visited: + return [self._space.get(index) for index in range(min(batch_size, self._space.length))] + while counter < batch_size and self._space.has_next(): + index = 0 + while self._trial_pt < len(self._trials): + index = self._trials[self._trial_pt] + if index not in self._visited: + break + self._trial_pt += 1 + + if self._trial_pt >= len(self._trials): + # if the trial list is empty choose randomly + index = self._space.fetch_index() + + ret.append(self._space.get(index)) + self._visited.add(index) + + counter += 1 + return ret + + def next_config(self, batch_size: int): + """extract next config orderly""" + ret = [] + counter = 0 + while counter < batch_size and self._space.has_next(): + index = self._space.fetch_next_index() + ret.append(self._space.get(index)) + self._visited.add(index) + counter += 1 + return ret + + def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""): + """export configs""" + mode = "a" if append else "w" + with open(output_file, mode) as f: + for x, y in configs: + if y != -1: + f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y)) + + def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""): + """export dim configs""" + mode = "a" if append else "w" + data = {} + try: + if os.path.isfile(output_file): + with open(output_file, 'r') as f: + data = json.load(f) + except IOError as e: + logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) + with open(output_file, mode) as f: + import re + data[key] = configs + s = json.dumps(data, sort_keys=True) + s = re.sub(r',\s*"', ',\n"', s) + s = '{\n' + s[1:-1] + '\n}' + f.write(s) + + def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=[]): + """export dim configs""" + mode = "a" if append else "w" + data = {} + try: + if 
os.path.isfile(output_file): + with open(output_file, 'r') as f: + data = json.load(f) + except IOError as e: + logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) + with open(output_file, mode) as f: + import copy + tmp = copy.deepcopy(configs) + for key in reversed(keys): + info = {key: tmp} + tmp = copy.deepcopy(info) + data.update(info) + s = json.dumps(data, sort_keys=True, indent=4) + print(s) + f.write(s) + + def load_configs(self, input_file: str): + """load configs""" + configs = [] + file_path = os.path.realpath(input_file) + if os.path.isfile(file_path): + with open(file_path, "r") as f: + for line in f: + x, y, _ = line.split('|') + configs.append((self._space.input_type(**json.loads(x)), np.float64(y))) + return configs + + def tune(self, least_try_times: int, output_file: str = None): + """grid search all configs""" + i = 0 + pbar = tqdm(total=least_try_times) + while i < least_try_times: + if not self._space.has_next(): + break + configs = self.next_config(min(self._n_parallel, least_try_times - i)) + run_times = self._runner.run(configs, self._best_time) + results = [] + for idx, conf in enumerate(configs): + results.append((conf.input_id, run_times[idx])) + # keep best config + if self.best_time > run_times[idx]: + self._best_time = run_times[idx] + self._best_iter = i + idx + self._best_config = conf + + i += len(results) + pbar.update(len(results)) + + # update + for res in results: + self._xs.append(res[0]) + self._ys.append(res[1]) + if output_file: + configs = [(self._space.get(res[0]).input, res[1]) for res in results] + self.export_configs(configs, output_file) + return run_times + + +class ModelBasedTuner(Tuner): + """Model based tuner + This tuner will fit a cost model and use an optimizer to find the maximums of the cost model as next trials + + Parameters + ---------- + plan_size: int + Tuner will re-fit model per `plan_size` new measure samples + pre_model: CostModel + The cost model that predicts the speed of a 
config (IR) + """ + + def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None): + super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel) + self.__plan_size = plan_size + + if pre_model is not None: + self.__cost_model = pre_model + self.__cost_model.reset_space(self._space) + else: + self.__cost_model = XgbCostModel(self._space) + + self.__model_optimizer = SimulatedAnnealingOptimizer(self._space) + self.__train_ct = 0 + + self.__is_auto_set_dim = False#True + + # time to leave + self.__ttl = None + self.__least_try_times = None + self.__early_stopping = None + + self.__model_run_time = 0.0 + + def info(self): + super(ModelBasedTuner, self).info() + print('model run time:', self.__model_run_time, 'secs') + + def model_res(self): + self.__cost_model.fit(self._xs, self._ys, self.__plan_size) + best_configs = self.__model_optimizer.find_best( + self.__cost_model, self.__plan_size, self._visited) + self._trials = best_configs + + def tune(self, least_try_times: int, output_file: str = None): + early_stopping = least_try_times + self.__least_try_times = least_try_times + self.__early_stopping = early_stopping + + logger.setLevel(logging.DEBUG) + old_level = logger.level + i = 0 + error_ct = 0 + + tuning_start = time.time() + while (i < self._space.length and (i < least_try_times + or (self._best_time > self._original_time - 0.9 + and i < least_try_times * 3))): + if not self._space.has_next(): + break + iter_start = time.time() + if not self.__is_auto_set_dim: + configs = self.next_batch(min(self._n_parallel, self._space.length - i)) + else: + configs = self.next_batch(min(self._n_parallel, self._space.length - i), False) + + logger.debug('--indexes: %s', str([x.input_id for x in configs])) + + run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim) + if self.__is_auto_set_dim: + from operator import add + from functools import reduce + self._original_time = reduce(add, 
run_times) / len(run_times) + self._best_time = self._original_time + self._best_iter = -1 + self._best_config = None + run_times = None + self.__is_auto_set_dim = False + continue + + results = [] + for idx, conf in enumerate(configs): + if run_times[idx] == -1: + continue + results.append((conf.input_id, run_times[idx])) + # keep best config + if self._best_time > run_times[idx]: + self._best_time = run_times[idx] + self._best_iter = i + idx + self._best_config = conf + + i += len(results) + self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i + + start = time.time() + # update + for res in results: + self._xs.append(res[0]) + self._ys.append(res[1]) + if output_file: + configs = [(self._space.get(res[0]).input, res[1]) for res in results] + desc = str(self._runner.op_desc) + self.export_configs(configs, output_file, desc=desc) + # if we have enough new training samples + if len(self._xs) >= self.__plan_size * (self.__train_ct + 1): + p = Process(target=self.model_res) + p.start() + p.join() + self._trial_pt = 0 + self.__train_ct += 1 + + end = time.time() + logger.debug('model running time: %f seconds', end - start) + self.__model_run_time += end - start + + iter_end = time.time() + logger.debug('iter time: %f seconds', iter_end - iter_start) + + if self._best_iter > 0 and i >= self.best_iter + early_stopping: + logger.debug('Early stopped. Best iter: %d', self._best_iter) + return + + print("tuning time already, ", time.time() - tuning_start) + if time.time() - tuning_start > 7200: + logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter) + return + + if error_ct > 150: + logging.warning('Too many errors happen in the tuning. 
Now is in debug mode') + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(old_level) + + self._tuning_time += time.time() - tuning_start diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json b/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json new file mode 100644 index 00000000..2896a2d5 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json @@ -0,0 +1,9 @@ +{ + "enable_atomic_add": { + "dtype": "bool", + "options": [ + "False", + "True" + ] + } +} \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py b/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py new file mode 100644 index 00000000..394cda1d --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py @@ -0,0 +1,155 @@ +from collections import namedtuple +import os +import logging + + +def get_block_str_from_config(config: namedtuple): + block_param = "" + if "block_x" in getattr(config, "_fields"): + block_param += str(config.block_x) + " " + + if "block_y" in getattr(config, "_fields"): + block_param += str(config.block_y) + " " + + if "block_z" in getattr(config, "_fields"): + block_param += str(config.block_z) + " " + return block_param + + +def get_thread_str_from_config(config: namedtuple): + thread_param = "" + if "thread_x" in getattr(config, "_fields"): + thread_param += str(config.thread_x) + " " + + if "thread_y" in getattr(config, "_fields"): + thread_param += str(config.thread_y) + " " + + if "thread_z" in getattr(config, "_fields"): + thread_param += str(config.thread_z) + " " + return thread_param + + +def get_parallel_build_num(): + """get the num of parallel build""" + env_dic = os.environ + try: + return int(env_dic.get('BUILD_PARALLEL_NUM').lower()) if env_dic.get('BUILD_PARALLEL_NUM') else 1 + except NameError as e: + logging.error(e) + return 1 + + +def get_available_gpu_num(): + """get the num of gpu""" + 
env_dic = os.environ + try: + return [int(id) for id in env_dic.get('USE_GPU_DEVICES').split(",")] if env_dic.get('USE_GPU_DEVICES') else [0, ] + except NameError as e: + logging.error(e) + return 1 + +def get_real_attr(value ,key ,need_tune_json, need_tune_keys): + if key not in need_tune_keys: + return value + if need_tune_json[key]['dtype'] == "bool": + if need_tune_json[key]['options'][value].lower() == "true": + return True + elif need_tune_json[key]['options'][value].lower() == "false": + return False + else: + raise TypeError("Wrong boolean type, please check json file") + elif need_tune_json[key]['dtype'] == "str": + if isinstance(need_tune_json[key]['options'][value], str): + return need_tune_json[key]['options'][value] + else: + raise TypeError("Wrong str type, please check json file") + elif need_tune_json[key]['dtype'] == "int": + if isinstance(need_tune_json[key]['options'][value], int): + return need_tune_json[key]['options'][value] + else: + raise TypeError("Wrong int type, please check json file") + + +def merge_attrs(attrs, config, need_tune_json): + tiling = [getattr(config, name) for name in getattr( + config, '_fields') if name.startswith('tiling')] + dim_str = '' + d_config = config._asdict() + d_attrs = attrs._asdict() + + is_2d_tiling = False + for name in getattr(config, '_fields'): + if name.startswith('tiling'): + if name.count("_") == 2: + is_2d_tiling = True + break + + for i, element in enumerate(tiling): + if is_2d_tiling: + if i % 2 == 0: + dim_str += "0 " + str(i//2) + " " + dim_str += str(element) + " " + else: + # 1d tiling + dim_str += "0 " + str(i) + " " + str(element) + " 1 " + + # add block, thread information + block = [str(getattr(config, name)) for name in getattr( + config, '_fields') if name.startswith('block')] + bind_block_str = ' '.join(block) + + thread = [str(getattr(config, name)) for name in getattr( + config, '_fields') if name.startswith('thread')] + bind_thread_str = ' '.join(thread) + + d_attrs['dim'] = dim_str 
+    d_attrs['bind_block'] = bind_block_str
+    d_attrs['bind_thread'] = bind_thread_str
+
+    need_tune_keys = need_tune_json.keys()
+    for key in need_tune_keys:
+        d_attrs[key] = d_config[key]
+
+    # make a new attrs with config info
+    attrs_type = type(attrs)
+    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
+    new_attrs = attrs_type(*config_list)
+    return new_attrs
+
+
+def get_skip_configs_from_log(skip_configs_log):
+    skip_config_set = set()
+    if skip_configs_log != "":
+        with open(skip_configs_log, 'r') as file:
+            for line in file:
+                config = str(line.split("|")[1]).strip()
+                skip_config_set.add(config)
+        print("SKIP CONFIGS NUMBER:", len(skip_config_set))
+    return skip_config_set
+
+def get_tuning_attrs_from_json(tuning_attrs_json):
+    import json
+    need_tune_spaces = [[]]
+    keys = []
+    json_string = dict()
+    if tuning_attrs_json != "":
+        with open(tuning_attrs_json,'r') as file:
+            json_string =json.load(file)
+        for key in json_string.keys():
+            keys.append(key)
+            num_options = len(json_string[key]['options'])
+            tmp_spaces = []
+            for space in need_tune_spaces:
+                for i in range(num_options):
+                    tmp_space = space[:]
+                    tmp_space.append(i)
+                    tmp_spaces.append(tmp_space)
+            need_tune_spaces = tmp_spaces[:]
+    return (keys, need_tune_spaces, json_string)
+
+if __name__ == "__main__":
+    """test components"""
+    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
+    keys, need_tune_spaces, json_string = get_tuning_attrs_from_json(file_name)
+    print(keys)
+    print(need_tune_spaces)
\ No newline at end of file
diff --git a/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py b/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
new file mode 100644
index 00000000..f792c73e
--- /dev/null
+++ b/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
@@ -0,0 +1,49 @@
+# Copyright 2019 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""operator description and config param definitions""" +from collections import namedtuple + +# op desc for ascend +ConvDesc = namedtuple("ConvDesc", [ + 'fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation', 'use_bias']) + +ConvBackpropDesc = namedtuple( + "ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation']) + +MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format", + "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"]) + + +# op desc for gpu +ReduceGpuDesc = namedtuple("ReduceGpuDesc", [ + "in_shape", "in_dtype", "axis", "keepdims", + "poly_sch", "dim", "bind_block", "bind_thread", + "enable_akg_reduce_lib", "enable_atomic_add"]) + + +# config param definitions for ascend +ConvConfig = namedtuple('ConvConfig', [ + 'tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass']) +ConvBackpropInputConfig = namedtuple('ConvBackpropInputConfig', + ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w']) +ConvBackpropFilterConfig = namedtuple('ConvBackpropFilterConfig', + ['tile_ci', 'tile_kh', 'tile_kw', 'tile_co', 'tile_batch', + 'tile_h', 'tile_w', 'tile_m', 'tile_k', 'tile_n']) +MatmulCubeConfig = namedtuple( + 'MatmulCubeConfig', ['n_l1', 'n_l0', 'm_l1', 'm_l0', 'k_l1', 'k_l0', 'bypass']) + +# config param definitions for gpu + +EmptyConfig = namedtuple('empty', []) diff --git a/tests/fuzz/tune_for_gpu/config_gpu.sh b/tests/fuzz/tune_for_gpu/config_gpu.sh new file mode 100644 index 00000000..f6e082ee --- /dev/null +++ 
b/tests/fuzz/tune_for_gpu/config_gpu.sh @@ -0,0 +1,16 @@ +# how many multi-processing to build +export BUILD_PARALLEL_NUM=4 + +# set the default gpu devices, plz never change it +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# set the real devices you want to use +export USE_GPU_DEVICES=0,1,2,3 + +export RUNTIME_MODE=gpu + +export PROFILING_MODE=true + +# ascend config +export DEVICE_ID=0 +export DEVICE_TOTAL_NUM=8 diff --git a/tests/fuzz/tune_for_gpu/test_gpu.py b/tests/fuzz/tune_for_gpu/test_gpu.py new file mode 100644 index 00000000..a06064ce --- /dev/null +++ b/tests/fuzz/tune_for_gpu/test_gpu.py @@ -0,0 +1,67 @@ +# Copyright 2019-2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""test"""
+import time
+from autotuning.job import launch
+from akg.utils import kernel_exec
+from akg.ops.math_gpu import reduce_sum
+from autotuning.type_definitions import ReduceGpuDesc
+import numpy as np
+import sys
+import argparse
+from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json
+
+
+def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
+    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
+                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
+                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
+    return mod
+
+def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
+    time_start = time.time()
+    op_type_ = 'reduce_sum_gpu'
+    debug_mode_ = True
+    save_res_ = True
+    all_space_ = True
+    op_config = [in_shape, in_dtype, axis, keepdims,
+                 "", "", "",
+                 True, True, True]
+    op_config = ReduceGpuDesc(*op_config)
+    desc_ = ('reduce_sum_gpu', reduce_sum_gpu_execute,
+             op_config, tuning_attrs_info)
+    launch(op_type=op_type_, debug_mode=debug_mode_,
+           save_res=save_res_, desc=desc_, all_space=all_space_,
+           from_json=False, skip_config_set=skip_config_set,
+           tuning_attrs_info=tuning_attrs_info)
+    time_end = time.time()
+    print("total tuning time: ", time_end - time_start)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--skip_configs_log", type=str,
+                        default="", help="skip those configs in .log file")
+    parser.add_argument("--tuning_attrs_json", type=str, default="",
+                        help="the json file to describe the tuning attrs")
+    args = parser.parse_args()
+
+    # check whether have configs need to skip
+    skip_config_set = get_skip_configs_from_log(args.skip_configs_log)
+
+    # add tuning_attrs from json file
+    tuning_attrs_info = get_tuning_attrs_from_json(args.tuning_attrs_json)
+
+    run_test_reduce_sum((1024, 1024), "float32", (1,),
+                        False, skip_config_set=skip_config_set,
tuning_attrs_info=tuning_attrs_info) diff --git a/tests/test_env.sh b/tests/test_env.sh index 2ca16cd7..ad80c58d 100644 --- a/tests/test_env.sh +++ b/tests/test_env.sh @@ -25,7 +25,7 @@ else TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm" export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH} - export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH} + export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH} if [ $# -eq 1 ] && [ $1 = "gpu" ]; then export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} fi -- Gitee