From 964188ef45f356121e66655f68442399a7b000a2 Mon Sep 17 00:00:00 2001 From: yiyanzhi_akane Date: Mon, 22 Mar 2021 14:42:57 +0800 Subject: [PATCH] [TUNING] add the gpu-tuning process to master bug fix for ascend gen tuning space bug fix: args in kernel_exec.py --- python/akg/build_module.py | 16 +- python/akg/utils/custom_tiling.py | 36 +- python/akg/utils/kernel_exec.py | 11 +- src/poly/tiling/custom_tiling.h | 24 + src/poly/tiling/gen_tiling_space.cc | 77 +- src/poly/tiling/tile_space.h | 10 + src/poly/tiling/tiling_analyzer.cc | 24 +- src/poly/tiling/tiling_analyzer.h | 31 +- src/poly/tiling/tiling_strategy_manager.h | 9 +- .../tiling/tiling_strategy_manager_gpu.cc | 242 +++++- tests/fuzz/tune_for_gpu/__init__.py | 0 .../autotuning/data_utils/sort_log.py | 17 + .../tune_for_gpu/autotuning/gen_spaces_gpu.py | 95 +++ tests/fuzz/tune_for_gpu/autotuning/job.py | 501 ++++++++++++ .../autotuning/kernel_compiler.py | 407 ++++++++++ tests/fuzz/tune_for_gpu/autotuning/runner.py | 243 ++++++ tests/fuzz/tune_for_gpu/autotuning/space.py | 217 +++++ .../autotuning/space_generators.py | 753 ++++++++++++++++++ .../autotuning/test_data_generators.py | 147 ++++ .../autotuning/tiling_strategies_gpu.py | 84 ++ tests/fuzz/tune_for_gpu/autotuning/tuner.py | 359 +++++++++ .../reduce_tuning_attrs_desc.json | 9 + .../tune_for_gpu/autotuning/tuning_utils.py | 155 ++++ .../autotuning/type_definitions.py | 49 ++ tests/fuzz/tune_for_gpu/config_gpu.sh | 16 + tests/fuzz/tune_for_gpu/test_gpu.py | 67 ++ tests/test_env.sh | 2 +- 27 files changed, 3520 insertions(+), 81 deletions(-) create mode 100644 tests/fuzz/tune_for_gpu/__init__.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/job.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/runner.py create mode 
100644 tests/fuzz/tune_for_gpu/autotuning/space.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/space_generators.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuner.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json create mode 100644 tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py create mode 100644 tests/fuzz/tune_for_gpu/autotuning/type_definitions.py create mode 100644 tests/fuzz/tune_for_gpu/config_gpu.sh create mode 100644 tests/fuzz/tune_for_gpu/test_gpu.py diff --git a/python/akg/build_module.py b/python/akg/build_module.py index 3f70f311..54a16033 100644 --- a/python/akg/build_module.py +++ b/python/akg/build_module.py @@ -50,7 +50,17 @@ def dump_tiling_info(level): logging.info(info, tuning_spaces["index"][i][0], tuning_spaces["index"][i][1], tuning_spaces["c1_range"][i][0], tuning_spaces["c1_range"][i][1], tuning_spaces["c1_mod"][i][0], tuning_spaces["c0_range"][i][0], - tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0]) + tuning_spaces["c0_range"][i][1], tuning_spaces["c0_mod"][i][0], + ) + idx_to_str = {0: "x", 1: "y", 2: "z"} + for i in range(len(tuning_spaces["thread_range"])): + info = "[thread.%s] range [%d, %d](jump by %d), " + logging.info(info, idx_to_str[i], tuning_spaces["thread_range"][i][0], tuning_spaces["thread_range"][i][1], + tuning_spaces['thread_mod'][i][0], ) + for i in range(len(tuning_spaces["block_range"])): + info = "[block.%s] range [%d, %d](jump by %d)" + logging.info(info, idx_to_str[i], tuning_spaces["block_range"][i][0], + tuning_spaces["block_range"][i][1], tuning_spaces['block_mod'][i][0],) logging.info("===============================================") elif isinstance(indice, int) and indice == EMPTY_CODE: logging.info("Empty tiling space.") @@ -108,6 +118,10 @@ def lower(sch, 
args, shape_params=None, name="default_function", binds=None, att tuning_spaces["c0_range"] = ret.c0_tile_range_table.asnumpy().tolist() tuning_spaces["c1_mod"] = ret.c1_tile_mod_table.asnumpy().tolist() tuning_spaces["c0_mod"] = ret.c0_tile_mod_table.asnumpy().tolist() + tuning_spaces["thread_range"] = ret.gpu_thread_range_table.asnumpy().tolist() + tuning_spaces["block_range"] = ret.gpu_block_range_table.asnumpy().tolist() + tuning_spaces["thread_mod"] = ret.gpu_thread_mod_table.asnumpy().tolist() + tuning_spaces["block_mod"] = ret.gpu_block_mod_table.asnumpy().tolist() if level >= help_tiling_level["Candidates"]: tuning_spaces["tuning_space"] = ret.tiling_candidate.asnumpy().tolist() if not tuning: diff --git a/python/akg/utils/custom_tiling.py b/python/akg/utils/custom_tiling.py index e5d7d060..4430af5b 100644 --- a/python/akg/utils/custom_tiling.py +++ b/python/akg/utils/custom_tiling.py @@ -70,15 +70,33 @@ class TileConstraint(Enum): SET_EXPANSION = "SET_EXPANSION" SET_MEM_RATIO = "SET_MEM_RATIO" SET_AXIS_INFO = "SET_AXIS_INFO" + THREAD_MIN = "THREAD_MIN" + THREAD_MAX = "THREAD_MAX" + THREAD_MOD = "THREAD_MOD" + BLOCK_MIN = "BLOCK_MIN" + BLOCK_MAX = "BLOCK_MAX" + BLOCK_MOD = "BLOCK_MOD" -@check_input_type((double, float, int), TileConstraint, TileLevel) +@check_input_type((double, float, int, list), TileConstraint, TileLevel) def modify_common_constraints(value, constraint, level=TileLevel.C1): """api for dsl to modify some default constraint used in auto tiling.""" if constraint not in TileConstraint: raise ValueError("Tile constraints must be chosen from {0}".format(TileConstraint)) if constraint == TileConstraint.SET_MEM_RATIO: return create_custom_tiling_node(TileMode.COMMON, tile_level=level, mem_ratio=double(value)) + if constraint == TileConstraint.THREAD_MIN: + return create_custom_tiling_node(TileMode.COMMON, thread_min=value) + if constraint == TileConstraint.THREAD_MAX: + return create_custom_tiling_node(TileMode.COMMON, thread_max=value) + if 
constraint == TileConstraint.THREAD_MOD: + return create_custom_tiling_node(TileMode.COMMON, thread_mod=value) + if constraint == TileConstraint.BLOCK_MIN: + return create_custom_tiling_node(TileMode.COMMON, block_min=value) + if constraint == TileConstraint.BLOCK_MAX: + return create_custom_tiling_node(TileMode.COMMON, block_max=value) + if constraint == TileConstraint.BLOCK_MOD: + return create_custom_tiling_node(TileMode.COMMON, block_mod=value) raise TypeError("Constraint {} is not supported in this api, please use other api" .format(constraint.value)) @@ -233,7 +251,13 @@ def create_custom_tiling_node(tile_mode, axis_info=DEFAULT_STRING, priority=DEFAULT_VALUE, expansion=DEFAULT_VALUE, - mem_ratio=double(DEFAULT_VALUE)): + mem_ratio=double(DEFAULT_VALUE), + thread_min=[], + thread_max=[], + thread_mod=[], + block_min=[], + block_max=[], + block_mod=[]): """default method to create custom tiling node, all values are default except tile mode.""" tile_min = to_tvm_type(tile_min, "tile_min") @@ -257,7 +281,13 @@ def create_custom_tiling_node(tile_mode, axis_info=akg.tvm.expr.StringImm(axis_info), priority=priority, expansion=expansion, - mem_ratio=mem_ratio) + mem_ratio=mem_ratio, + thread_min=thread_min, + thread_max=thread_max, + thread_mod=thread_mod, + block_min=block_min, + block_max=block_max, + block_mod=block_mod) def template_nc1hwc0(tensor_name, level): diff --git a/python/akg/utils/kernel_exec.py b/python/akg/utils/kernel_exec.py index cd98ec6b..1bee4e3c 100644 --- a/python/akg/utils/kernel_exec.py +++ b/python/akg/utils/kernel_exec.py @@ -35,6 +35,7 @@ import numpy as np import akg from akg.build_module import help_tiling_level +from akg import backend as cce import akg.tvm from akg.tvm import autotvm from akg.tvm import rpc @@ -88,7 +89,6 @@ def debug_mode(debug_flag): pass_list.append((0, ir_pass.inject_dma_intrin)) return pass_list - def func_time_required(func_name): """Checking the Time Required for Function Running.""" def wrapper(*args, 
**kwargs): @@ -467,7 +467,7 @@ def mod_launch_air(mod, args, outputs): return None @func_time_required -def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None): +def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None, repeat_time=400): """ unified run CCE kernel api. @@ -492,7 +492,7 @@ def mod_launch(mod, args, outputs=(-1,), tuning=False, device_id=0, expect=None) if not tuning: return out_list[0] if len(out_list) == 1 else tuple(out_list) else: - cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True) + cycles = get_gpu_cycles(mod, *mod_args, device_id=device_id, save_log=True, repeat_time=repeat_time) return out_list[0] if len(out_list) == 1 else tuple(out_list), {'run_time': cycles} stat_info = {} @@ -996,7 +996,6 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", level = attrs.get("help_tiling") if attrs and "help_tiling" in attrs else None if tuning or (level is not None and level > help_tiling_level['None']): return gen_spaces_dim_key(op_func, args, s, op_var, kernel_name, attrs, polyhedral, tuning, target) - mode = get_runtime_mode() if mode == "cpu": mod = akg.tvm.build(s, op_var, "llvm") @@ -1069,12 +1068,12 @@ def get_device_id(): logging.error(e) return 0 -def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False): +def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False, repeat_time=400): "get gpu profiling cycles." 
func = tvm.get_global_func('GPUProfilerInit') func("") from akg.utils.result_analysis import gpu_profiling - gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id) + gpu_profiling(mod, *mod_args, repeat_time=repeat_time, device_id=device_id) func = tvm.get_global_func('GPUProfilerStop') a = func() return int(a) diff --git a/src/poly/tiling/custom_tiling.h b/src/poly/tiling/custom_tiling.h index b1da52cb..f3848af0 100644 --- a/src/poly/tiling/custom_tiling.h +++ b/src/poly/tiling/custom_tiling.h @@ -80,6 +80,24 @@ class CustomTilingNode : public Node { * default is 0.5 which is reserved for double buffer*/ double mem_ratio; + /*! \brief minimal thread binding factor on gpu, greater than 0*/ + Array thread_min; + + /*! \brief maximal thread binding factor on gpu*/ + Array thread_max; + + /*! \brief constraint thread binding factor % thread_mod == 0*/ + Array thread_mod; + + /*! \brief minimal block binding factor on gpu, greater than 0*/ + Array block_min; + + /*! \brief maximal block binding factor on gpu*/ + Array block_max; + + /*! 
\brief constraint block binding factor % block_mod == 0*/ + Array block_mod; + void VisitAttrs(AttrVisitor *v) { v->Visit("tile_level", &tile_level); v->Visit("tile_mode", &tile_mode); @@ -97,6 +115,12 @@ class CustomTilingNode : public Node { v->Visit("priority", &priority); v->Visit("expansion", &expansion); v->Visit("mem_ratio", &mem_ratio); + v->Visit("thread_min", &thread_min); + v->Visit("thread_max", &thread_max); + v->Visit("thread_mod", &thread_mod); + v->Visit("block_min", &block_min); + v->Visit("block_max", &block_max); + v->Visit("block_mod", &block_mod); } static constexpr const char *_type_key = "CustomTilingNode"; diff --git a/src/poly/tiling/gen_tiling_space.cc b/src/poly/tiling/gen_tiling_space.cc index 779ef84a..72f61623 100644 --- a/src/poly/tiling/gen_tiling_space.cc +++ b/src/poly/tiling/gen_tiling_space.cc @@ -36,6 +36,15 @@ class TileSpaceCollector { space_->c1_tile_mod_table = init_array; space_->c0_tile_mod_table = init_array; space_->tiling_candidate = init_array; + space_->gpu_thread_range_table = init_array; + space_->gpu_block_range_table = init_array; + space_->gpu_thread_mod_table = init_array; + space_->gpu_block_mod_table = init_array; + if (analyzer_.scop_info_.user_config_.GetTarget() == TARGET_CUDA) { + cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod", "gpu_thread_range", "gpu_block_range", "gpu_thread_mod", "gpu_block_mod"}; + } else { + cared_info_ = {"index", "C1_range", "C0_range", "C1_mod", "C0_mod"}; + } } ~TileSpaceCollector() = default; @@ -122,38 +131,61 @@ class TileSpaceCollector { // step 2. collect cared info from each axis for (const auto &con : cared_info_) { int length = con.find("mod") != std::string::npos ? 
1 : 2; - auto array = air::runtime::NDArray::Empty({static_cast(tile_size), length}, type, ctx); + auto size = static_cast(tile_size); + if (con.find("gpu") != std::string::npos) { + size = std::max(3, size); + } + auto array = air::runtime::NDArray::Empty({size, length}, type, ctx); auto spaceDlPack = array.ToDLPack(); auto ptr = reinterpret_cast(spaceDlPack->dl_tensor.data); - for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { - for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { - if (con == "index") { - *ptr++ = b_idx; - *ptr++ = a_idx; + if (con.find("gpu") != std::string::npos) { + size_t s = con.find("thread") != std::string::npos ? 0 : 3; + size_t e = con.find("thread") != std::string::npos ? 3 : 6; + for (size_t i = s; i < e; ++i) { + if (length == 1) { + *ptr++ = analyzer_.binding_spaces_[i].map_mod_; } else { - if (con == "C1_range") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); - *ptr++ = const_cons.tile_min_.as()->value; - *ptr++ = const_cons.tile_extent_.as()->value; - } else if (con == "C0_range") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); - *ptr++ = const_cons.tile_min_.as()->value; - *ptr++ = const_cons.tile_extent_.as()->value; - } else if (con == "C1_mod") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); - *ptr++ = const_cons.tile_mod_.as()->value; - } else if (con == "C0_mod") { - TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); - *ptr++ = const_cons.tile_mod_.as()->value; + *ptr++ = analyzer_.binding_spaces_[i].map_min_; + *ptr++ = analyzer_.binding_spaces_[i].map_extent_; + } + } + } else { + for (size_t b_idx = 0; b_idx < all_axes.size(); ++b_idx) { + for (size_t a_idx = 0; a_idx < all_axes[b_idx].size(); ++a_idx) { + if (con == "index") { + *ptr++ = b_idx; + *ptr++ = a_idx; + } else { + if (con == "C1_range") { + TileAxis::Constraint const_cons = 
all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); + *ptr++ = const_cons.tile_min_.as()->value; + *ptr++ = const_cons.tile_extent_.as()->value; + } else if (con == "C0_range") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); + *ptr++ = const_cons.tile_min_.as()->value; + *ptr++ = const_cons.tile_extent_.as()->value; + } else if (con == "C1_mod") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE1); + *ptr++ = const_cons.tile_mod_.as()->value; + } else if (con == "C0_mod") { + TileAxis::Constraint const_cons = all_axes[b_idx][a_idx]->GetConstConstraint(CACHE0); + *ptr++ = const_cons.tile_mod_.as()->value; + } } } } } + if (con == "index") space_->index_table = array; if (con == "C1_range") space_->c1_tile_range_table = array; if (con == "C0_range") space_->c0_tile_range_table = array; if (con == "C1_mod") space_->c1_tile_mod_table = array; if (con == "C0_mod") space_->c0_tile_mod_table = array; + if (con == "gpu_thread_range") space_->gpu_thread_range_table = array; + if (con == "gpu_block_range") space_->gpu_block_range_table = array; + if (con == "gpu_thread_mod") space_->gpu_thread_mod_table = array; + if (con == "gpu_block_mod") space_->gpu_block_mod_table = array; + delete spaceDlPack; } } @@ -196,7 +228,8 @@ class TileSpaceCollector { bool min_tile_ok = false; for (int64_t tile = tile_min->value; tile <= tile_extent->value; ++tile) { bool break_constraint = - (tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0); + ((tile != tile_min->value) && (tile != tile_extent->value) && (tile % tile_mod->value != 0)) || + (axis->forbid_iso && tile_extent->value % tile != 0); if (analyzer_.scop_info_.user_config_.GetPruneTuningSpace() && break_constraint) { continue; } @@ -365,7 +398,7 @@ class TileSpaceCollector { DLContext ctx = {kDLCPU, 0}; std::vector tile_axes_; std::vector is_shared_; - std::unordered_set cared_info_ = {"index", "C1_range", "C0_range", 
"C1_mod", "C0_mod"}; + std::unordered_set cared_info_; struct Result { std::vector tile; diff --git a/src/poly/tiling/tile_space.h b/src/poly/tiling/tile_space.h index e1a00a02..5171e85a 100644 --- a/src/poly/tiling/tile_space.h +++ b/src/poly/tiling/tile_space.h @@ -28,6 +28,11 @@ class TileSpaceNode : public Node { air::runtime::NDArray c1_tile_mod_table; air::runtime::NDArray c0_tile_mod_table; air::runtime::NDArray tiling_candidate; + air::runtime::NDArray gpu_thread_range_table; + air::runtime::NDArray gpu_block_range_table; + air::runtime::NDArray gpu_thread_mod_table; + air::runtime::NDArray gpu_block_mod_table; + void VisitAttrs(AttrVisitor *v) { v->Visit("index_table", &index_table); @@ -36,6 +41,11 @@ class TileSpaceNode : public Node { v->Visit("c1_tile_mod_table", &c1_tile_mod_table); v->Visit("c0_tile_mod_table", &c0_tile_mod_table); v->Visit("tiling_candidate", &tiling_candidate); + v->Visit("gpu_thread_range_table", &gpu_thread_range_table); + v->Visit("gpu_block_range_table", &gpu_block_range_table); + v->Visit("gpu_thread_mod_table", &gpu_thread_mod_table); + v->Visit("gpu_block_mod_table", &gpu_block_mod_table); + } static constexpr const char *_type_key = "TileSpace"; TVM_DECLARE_NODE_TYPE_INFO(TileSpaceNode, Node); diff --git a/src/poly/tiling/tiling_analyzer.cc b/src/poly/tiling/tiling_analyzer.cc index e4b37dc5..bdb97de5 100644 --- a/src/poly/tiling/tiling_analyzer.cc +++ b/src/poly/tiling/tiling_analyzer.cc @@ -1351,19 +1351,34 @@ void TilingAnalyzer::AddPostTilingConstraints() { if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { ReduceStrategy reduce_strategy(this); - actived_strategies.push_back(&reduce_strategy); ModStrategy mod_strategy(this); - actived_strategies.push_back(&mod_strategy); - + GemmStrategy gemm_strategy(this); GpuDmaAnalysisStrategy dma_analysis_strategy(this); + CustomTilingStrategy custom_strategy(this); GpuStrategy gpu_strategy(this); if (scop_info_.analysis_result_.GetIsGpuDmaAnalysed()) { 
actived_strategies.push_back(&dma_analysis_strategy); } else { + if (scop_info_.user_config_.GetIsTuning()) { + actived_strategies.push_back(&custom_strategy); + } else { + actived_strategies.push_back(&reduce_strategy); + actived_strategies.push_back(&mod_strategy); + actived_strategies.push_back(&gemm_strategy); + } actived_strategies.push_back(&gpu_strategy); } strategy_manager->SetStrategies(actived_strategies); strategy_manager->ExecuteGpu(); + if (scop_info_.user_config_.GetIsTuning()) { + binding_spaces_.clear(); + for (auto i : gpu_strategy.thread_binding_spaces_) { + UpdateBindingSpace(i); + } + for (auto i : gpu_strategy.block_binding_spaces_) { + UpdateBindingSpace(i); + } + } return; } } @@ -1376,7 +1391,6 @@ void TilingAnalyzer::AddTilingConstraints() { if (scop_info_.user_config_.GetTarget() == TARGET_CUDA) { CastStrategy cast_strategy(this); actived_strategies.push_back(&cast_strategy); - strategy_manager->SetStrategies(actived_strategies); strategy_manager->ExecuteGpu(); return; @@ -1429,7 +1443,7 @@ void TilingAnalyzer::AddTilingConstraints() { bool TilingAnalyzer::Prepare() { logger_ = std::unique_ptr(new (std::nothrow) TileLogger( - scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); + scop_info_.AddDumpDir("tiling.log"), !scop_info_.user_config_.GetDumpPolyDir().empty())); CHECK(logger_) << "memory alloc fail."; // Stage 1: Analyze schedule tree. 
ScheduleTreeAnalyzer sch_ana(this, this->sch_); diff --git a/src/poly/tiling/tiling_analyzer.h b/src/poly/tiling/tiling_analyzer.h index 3a00e83d..b3adc711 100644 --- a/src/poly/tiling/tiling_analyzer.h +++ b/src/poly/tiling/tiling_analyzer.h @@ -64,7 +64,7 @@ inline int64_t GetAlignBytes(const int64_t dtype) { return (ALIGN_BYTES + dtype - 1) / dtype; } -inline int64_t GetMaxAlignBytes(std::unordered_map> dtypes) { +inline int64_t GetMinBytes(std::unordered_map> dtypes) { int64_t min_byte = -1; for (const auto &it : dtypes) { if (it.second.empty()) { @@ -75,7 +75,11 @@ inline int64_t GetMaxAlignBytes(std::unordered_map min_byte = min_elem; } } - return GetAlignBytes(min_byte); + return min_byte; +} + +inline int64_t GetMaxAlignBytes(std::unordered_map> dtypes) { + return GetAlignBytes(GetMinBytes(dtypes)); } inline Expr CastToExpr(const std::string &value) { @@ -134,6 +138,12 @@ constexpr auto AT_DYNAMIC_BOUND = "DYNAMIC_BOUND"; constexpr auto AT_MOD = "MOD"; constexpr auto AT_CAST = "CAST"; constexpr auto AT_MEM_RATIO = "MEM_RATIO"; +constexpr auto AT_THREAD_MIN = "THREAD_MIN"; +constexpr auto AT_THREAD_MAX = "THREAD_MAX"; +constexpr auto AT_THREAD_MOD = "THREAD_MOD"; +constexpr auto AT_BLOCK_MIN = "BLOCK_MIN"; +constexpr auto AT_BLOCK_MAX = "BLOCK_MAX"; +constexpr auto AT_BLOCK_MOD = "BLOCK_MOD"; class TilingAnalyzer; @@ -233,12 +243,12 @@ class TilingAnalyzer { sch_(sch), scop_info_(scop_info), is_retry_(!global_attrs.GetStringAttr(kErrorInfo, "").empty()) { - if (scop_info.mmu_info_.IsGemm()) { - op_type_ = GEMM_OP; - } else if (scop_info.mmu_info_.IsConv()) { - op_type_ = CONV_OP; - } else { - op_type_ = VECTOR_OP; + if (scop_info.mmu_info_.IsGemm()) { + op_type_ = GEMM_OP; + } else if (scop_info.mmu_info_.IsConv()) { + op_type_ = CONV_OP; + } else { + op_type_ = VECTOR_OP; } } @@ -292,7 +302,7 @@ class TilingAnalyzer { CHECK(logger_); return *(logger_.get()); } - + void UpdateBindingSpace(TileAxis::MappingConstraint constraint) { 
binding_spaces_.emplace_back(constraint); } Stmt body_; Binds &binds_; isl::schedule sch_; @@ -306,9 +316,8 @@ class TilingAnalyzer { std::unordered_map> buffer_usage_timetable_; std::unordered_map> buf_info_; - bool is_retry_{false}; - + std::vector binding_spaces_; // [thread.x[min, max, mod], thread.y, thread.z, block.x, block.y, block.z] private: void AddTilingConstraints(); void AddPostTilingConstraints(); diff --git a/src/poly/tiling/tiling_strategy_manager.h b/src/poly/tiling/tiling_strategy_manager.h index 140b0cae..513745fc 100644 --- a/src/poly/tiling/tiling_strategy_manager.h +++ b/src/poly/tiling/tiling_strategy_manager.h @@ -284,8 +284,6 @@ class GemmStrategy : public TilingStrategy { ~GemmStrategy() {} void AddNpuConstraint(); void AddGpuConstraint(); - - std::string interested_attr_key = AT_GEMM; }; class GpuStrategy : public TilingStrategy { @@ -306,6 +304,8 @@ class GpuStrategy : public TilingStrategy { }; void AddNpuConstraint(); void AddGpuConstraint(); + std::vector thread_binding_spaces_; // [thread.x, thread.y, thread.z] + std::vector block_binding_spaces_; // [block.x, block.y, block.z] private: void DetermineTemplate(); @@ -326,6 +326,8 @@ class GpuStrategy : public TilingStrategy { // Step 1. Collect axes and sort them from inner to outer void BuildAxesQueue(); + void ApplyCustomConstraint(); + /* * Step 2. Tile inner axes first and map them to threads, and then tile outer axis and map the rest of them to blocks. * e.g. 
@@ -357,6 +359,7 @@ class GpuStrategy : public TilingStrategy { int64_t min_elem_for_io_bound_ = 2; size_t depth_{0}; bool need_reverse_{false}; + bool reverse_binding_{false}; int64_t fused_size_{1}; std::unordered_map template_map_ = {{0, "DEFAULT"}, {1, "PURE_ELEM"}, {2, "BROADCAST_OP"}, {3, "REDUCTION"}, {4, "ALL_REDUCE"}, {5, "BITWISE_REDUCTION"}, @@ -378,7 +381,7 @@ class MulticoreStrategy { class TilingPriorityScorer { public: - TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} + TilingPriorityScorer(TilingAnalyzer &analyzer) : analyzer_(analyzer), logger_(analyzer.GetTileLogger()) {} ~TilingPriorityScorer() {} /* diff --git a/src/poly/tiling/tiling_strategy_manager_gpu.cc b/src/poly/tiling/tiling_strategy_manager_gpu.cc index f0f9083c..8233b21c 100644 --- a/src/poly/tiling/tiling_strategy_manager_gpu.cc +++ b/src/poly/tiling/tiling_strategy_manager_gpu.cc @@ -18,7 +18,6 @@ #include #include "tiling_analyzer.h" - namespace akg { namespace ir { namespace poly { @@ -377,13 +376,129 @@ void ReduceStrategy::DealWithPostReduceTensors() { } } +void GpuStrategy::ApplyCustomConstraint() { + auto ParseBindingConstraint = [](const std::string constraint, size_t max_size) { + std::vector sp = akg::common::Split(constraint, ","); + std::vector ret; + for (auto val : sp) { + if (ret.size() == max_size) { + break; + } + CHECK(!val.empty()); + ret.emplace_back(static_cast(std::strtol(val.c_str(), nullptr, 10))); + } + return ret; + }; + + // init binding space through template-determined limit + thread_binding_spaces_.clear(); + block_binding_spaces_.clear(); + for (size_t i = 0; i < thread_limit_.size(); ++i) { + TileAxis::MappingConstraint elem; + elem.map_extent_ = thread_limit_[i]; + thread_binding_spaces_.emplace_back(elem); + } + for (size_t i = 0; i < std::min(depth_, block_limit_.size()); ++i) { + TileAxis::MappingConstraint elem; + elem.map_extent_ = block_limit_[i]; + 
block_binding_spaces_.emplace_back(elem); + } + + // add constraints to binding space according to custom tiling + std::unordered_set thread_keys = {AT_THREAD_MIN, AT_THREAD_MAX, AT_THREAD_MOD}; + std::unordered_set block_keys = {AT_BLOCK_MIN, AT_BLOCK_MAX, AT_BLOCK_MOD}; + for (const auto attr : analyzer_->RootAxis()->attrs) { + std::vector constraint; + std::vector target; + if (thread_keys.find(attr.attr_key) != thread_keys.end()) { + constraint = ParseBindingConstraint(attr.attr_value, thread_binding_spaces_.size()); + target = thread_binding_spaces_; + } else if (block_keys.find(attr.attr_key) != block_keys.end()) { + constraint = ParseBindingConstraint(attr.attr_value, block_binding_spaces_.size()); + target = block_binding_spaces_; + } + if (constraint.empty()) { + continue; + } + + for (size_t i = 0; i < constraint.size(); ++i) { + if (attr.attr_key.find("MIN") != std::string::npos) { + target[i].map_min_ = std::max(target[i].map_min_, constraint[i]); + } else if (attr.attr_key.find("MAX") != std::string::npos && constraint[i] > 0) { + target[i].map_extent_ = std::min(target[i].map_extent_, constraint[i]); + } else if (attr.attr_key.find("MOD") != std::string::npos) { + target[i].map_mod_ = std::max(1, constraint[i]); + } + } + + if (thread_keys.find(attr.attr_key) != thread_keys.end()) { + thread_binding_spaces_ = target; + } else if (block_keys.find(attr.attr_key) != block_keys.end()) { + block_binding_spaces_ = target; + } + } + + // apply custom constraint to corresponding axis and modify binding space according to tile range of axis + size_t cur_depth = 0; + analyzer_->ForEachAxisTopDown([this, &cur_depth](TileAxis *axis) { + if (axis == analyzer_->RootAxis()) { + return; + } + auto cons = axis->GetConstConstraint(CACHE1); + auto range_extent = axis->GetConstExtent(); + int tile_min = cons.tile_min_.as()->value; + int tile_extent = cons.tile_extent_.as()->value; + auto idx = reverse_binding_ ? 
cur_depth : depth_ - 1 - cur_depth; + + auto thread_extent = tile_extent; + if (idx < thread_binding_spaces_.size()) { + thread_extent = std::min(thread_extent, thread_binding_spaces_[idx].map_extent_); + thread_binding_spaces_[idx].map_extent_ = thread_extent; + } + + auto block_extent = range_extent / tile_min; + if (idx < block_binding_spaces_.size()) { + block_extent = std::min(block_extent, block_binding_spaces_[idx].map_extent_); + block_binding_spaces_[idx].map_extent_ = block_extent; + } + + auto block_min = block_extent / std::max(1, thread_extent); + if (idx < block_binding_spaces_.size()) { + block_min = std::max(block_min, block_binding_spaces_[idx].map_min_); + block_binding_spaces_[idx].map_min_ = block_min; + } + + axis->thread_constraints.map_extent_ = thread_extent; + axis->block_constraints.map_extent_ = block_extent; + axis->block_constraints.map_min_ = block_min; + if (idx < thread_binding_spaces_.size()) { + axis->thread_constraints.map_mod_ = thread_binding_spaces_[idx].map_mod_; + } + if (idx < block_binding_spaces_.size()) { + axis->block_constraints.map_mod_ = block_binding_spaces_[idx].map_mod_; + } + ++cur_depth; + }); +} + void GpuStrategy::AddGpuConstraint() { InitMappingLimit(); - if (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG) { + if (!analyzer_->scop_info_.user_config_.GetIsTuning() && + (template_ == Template::BROADCAST_OP || template_ == Template::CUSTOM_CONFIG)) { BroadcastSpeedup(); } BuildAxesQueue(); if (analyzer_->scop_info_.user_config_.GetIsTuning()) { + ApplyCustomConstraint(); + for (size_t i = 0; i < max_dim_; ++i) { + TileAxis::MappingConstraint pad; + if (i >= thread_binding_spaces_.size()) { + thread_binding_spaces_.emplace_back(pad); + } + if (i >= block_binding_spaces_.size()) { + block_binding_spaces_.emplace_back(pad); + } + } return; } InnerThreadOuterBlock(); @@ -391,19 +506,27 @@ void GpuStrategy::AddGpuConstraint() { InjectiveSpeedup(); } SetMappingConfig(); + if (template_ != 
Template::MATMUL || !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { + analyzer_->ForEachAxisTopDown([this](TileAxis *axis) { + if (axis == analyzer_->RootAxis()) { + return; + } + axis->TileRestrainToSingleValue(axis->c1_constraints.tile_min_, TileLevel::CACHE0); + }); + } } void GpuStrategy::InitMappingLimit() { max_num_threads_ = analyzer_->scop_info_.user_config_.GetMaxElemPerThread(); DetermineTemplate(); std::stringstream ss; - need_reverse_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && - analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; + reverse_binding_ = analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && + analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION; if (template_ == Template::CUSTOM_CONFIG) { auto thread_config = analyzer_->scop_info_.user_config_.GetThreadConfig(); for (size_t i = 0; i < thread_config->bound; ++i) { - auto idx = need_reverse_ ? thread_config->bound - 1 - i : i; + auto idx = reverse_binding_ ? thread_config->bound - 1 - i : i; if (idx >= depth_) { continue; } @@ -427,12 +550,16 @@ void GpuStrategy::InitMappingLimit() { } else if (template_ == Template::MATMUL) { // This is a naive tiling strategy used in gpu when thread and block configs are already set. // This strategy will tile up to three inner-most axes to 32 (for thread binding). - thread_limit_ = {32, 8}; + if (analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { + thread_limit_ = {warp_sizes_, 16}; + } else { + thread_limit_ = {warp_sizes_, 8}; + } } else { thread_limit_ = {max_x_y_dim_thread_, max_x_y_dim_thread_, max_z_dim_thread_}; } - if (template_ != Template::CUSTOM_CONFIG) { + if (template_ != Template::CUSTOM_CONFIG && !analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { AdjustThreadMappingLimit(); } @@ -505,13 +632,21 @@ void GpuStrategy::InnerThreadOuterBlock() { tile = tile == SpItemPerThread::AUTO ? 
std::min(axis->thread_constraints.item_process_, max_elem_per_thread_) : tile == SpItemPerThread::FULL ? std::min(shape, max_elem_per_thread_) : 1; - if (axis->block_constraints.map_extent_ > 1) { - tile = - std::max(tile, std::max(ceil(static_cast(shape) / axis->block_constraints.map_extent_), 1)); - pending_axes_.push_back(std::make_pair(axis, std::max(ceil(static_cast(shape) / tile), 1))); - ss << ", map to block."; + auto tile_min = axis->c1_constraints.tile_min_.as()->value; + auto tile_extent = axis->c1_constraints.tile_extent_.as()->value; + if (tile_min == tile_extent && tile_extent != MIN_TILE) { + ss << "tile extent is already determined = " << tile_extent; + analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); + tile = tile_min; } else { - tile = std::min(tile, shape); + if (axis->block_constraints.map_extent_ > 1) { + tile = + std::max(tile, std::max(ceil(static_cast(shape) / axis->block_constraints.map_extent_), 1)); + pending_axes_.push_back(std::make_pair(axis, std::max(ceil(static_cast(shape) / tile), 1))); + ss << ", map to block."; + } else { + tile = std::min(tile, shape); + } } axis->TileRestrainLower(tile, TileLevel::CACHE1); ss << ", tile = " << tile; @@ -522,19 +657,11 @@ void GpuStrategy::InnerThreadOuterBlock() { rest_threads = std::min(rest_threads, axis->thread_constraints.map_extent_); } - if (thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { + if (rest_threads <= 1 || thread_cfg_.size() >= thread_dim || inner_dim >= max_dim_) { ss << ", no thread/dim rests"; SkipMapping(); continue; } - if (rest_threads <= 1) { - if (axis->mc_sup || - (template_ == Template::REDUCTION && analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib())) { - thread_cfg_.emplace_back(1); - } - SkipMapping(); - continue; - } auto item = elem_per_thread_[inner_dim] == SpItemPerThread::AUTO ? 
axis->thread_constraints.item_process_ : elem_per_thread_[inner_dim]; item = std::min(item, max_elem_per_thread_); @@ -575,6 +702,7 @@ void GpuStrategy::InnerThreadOuterBlock() { if (pending_axes_.size() - i > block_dim) { auto axis = pending_axes_[i].first; ss << "axis " << axis->index << "_" << axis->dim_axis + << " exceeded block dim and should be mapped to block for higher performance, consider flatten"; analyzer_->GetTileLogger().AppendLog(GPU_MAPPING, ss); continue; @@ -594,7 +722,7 @@ void GpuStrategy::InnerThreadOuterBlock() { int64_t shape; std::tie(axis, shape) = pending_axes_[i]; auto idx = pending_axes_.size() - 1 - i; - idx = need_reverse_ ? block_limit_.size() - 1 - idx : idx; + idx = reverse_binding_ ? block_limit_.size() - 1 - idx : idx; auto rest_blocks = std::min(max_num_blocks_ / activated_blocks, block_limit_[idx]); rest_blocks = std::min(rest_blocks, axis->block_constraints.map_extent_); ss << "axis " << axis->index << "_" << axis->dim_axis << " shape = " << shape << ", rest blocks = " << rest_blocks; @@ -635,11 +763,9 @@ void GpuStrategy::SetMappingConfig() { if (block_cfg_.empty()) { block_cfg_.emplace_back(1); } - bool reverse_binding = (analyzer_->scop_info_.user_config_.GetEnableAkgReduceLib() && - analyzer_->scop_info_.analysis_result_.GetReduceDirection() == Y_DIRECTION); std::string block_str = ""; std::string thread_str = ""; - if (reverse_binding) { + if (reverse_binding_) { for (int i = 0; i < static_cast(block_cfg_.size()); ++i) { if (i >= block_count_) { continue; @@ -753,7 +879,7 @@ int64_t GpuStrategy::TileAfterThreadMapping(TileAxis *axis, size_t inner_dim, in tile = thread_size; ss << "tile = thread size, "; } else { - auto block_dim = need_reverse_ ? inner_dim : block_limit_.size() - 1 - inner_dim; + auto block_dim = reverse_binding_ ? 
inner_dim : block_limit_.size() - 1 - inner_dim; int64_t least_blocks; if (block_dim >= 0 && block_dim < block_limit_.size()) { least_blocks = block_limit_[block_dim]; @@ -1139,12 +1265,70 @@ void GpuStrategy::GpuVectorBroadcastStrategy() { } } +void CustomTilingStrategy::AddGpuConstraint() { + auto interested_info = GetInterestedInfo(interested_attr_key, false); + for (auto it : interested_info) { + TileAxis *axis = it.first; + for (auto attr : it.second) { + std::vector modes = akg::common::Split(attr.attr_key, ":"); + CHECK_EQ(modes.size(), 2U); + std::string constraint_str = attr.attr_value; + if (constraint_str.find("->") != std::string::npos) { + std::vector res = akg::common::Split(constraint_str, "->"); + constraint_str = res[1]; + } + std::vector constraints = akg::common::Split(constraint_str, "_"); + CHECK_GE(constraints.size(), 1U); + std::vector level = akg::common::Split(constraints[0], ":"); + CHECK(level.size() == 2U && level[0] == "LEVEL"); + CHECK(level[1] == "C1" || level[1] == "C0"); + TileLevel lv = level[1] == "C1" ? 
CACHE1 : CACHE0; + constraints.erase(constraints.begin()); + for (const auto &con : constraints) { + std::vector items = akg::common::Split(con, ":"); + CHECK_EQ(items.size(), 2U); + CHECK_NE(items[0], ""); + CHECK_NE(items[1], ""); + if (items[0] == "MIN") { + if (items[1] == "MIN") { + if (lv == CACHE1) { + axis->c1_constraints.tile_extent_ = axis->c1_constraints.tile_min_; + } else if (lv == CACHE0) { + axis->c0_constraints.tile_extent_ = axis->c0_constraints.tile_min_; + } + } else { + if (lv == CACHE1) { + axis->c1_constraints.tile_min_ = CastToExpr(items[1]); + } else if (lv == CACHE0) { + axis->c0_constraints.tile_min_ = CastToExpr(items[1]); + } + } + } else if (items[0] == "FACTOR") { + axis->TileRestrainToSingleValue(CastToExpr(items[1]), lv); + } else if (items[0] == "FORBIDISO") { + axis->forbid_iso = true; + } else if (items[0] == "MAX") { + if (items[1] == "FULL") { + axis->TileRestrainEntire(lv); + } else { + if (lv == CACHE1) { + axis->c1_constraints.tile_extent_ = CastToExpr(items[1]); + } else if (lv == CACHE0) { + axis->c0_constraints.tile_extent_ = CastToExpr(items[1]); + } + } + } else if (items[0] == AT_MOD) { + axis->TileRestrainMod(CastToExpr(items[1]), lv); + } + } + } + } +} + // No constraint found in cuda void ModStrategy::AddGpuConstraint() {} -void CustomTilingStrategy::AddGpuConstraint() {} - void ConflictTreeRangeStrategy::AddGpuConstraint() {} void VectorizedStrategy::AddGpuConstraint() {} diff --git a/tests/fuzz/tune_for_gpu/__init__.py b/tests/fuzz/tune_for_gpu/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py b/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py new file mode 100644 index 00000000..377cead2 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/data_utils/sort_log.py @@ -0,0 +1,17 @@ +import sys + +if __name__ == "__main__": + from_log_file = str(sys.argv[1]) + sorted_log_file = str(sys.argv[2]) + f_in = open(from_log_file, 'r') 
+ f_out = open(sorted_log_file, "wt") + d = dict() + for line in f_in: + config = line.split("|") + d[str(config[1])] = float(config[2]) + sorted_dict = {k: v for k, v in sorted( + d.items(), key=lambda item: (item[1], item[0]))} + for k, v in sorted_dict.items(): + f_out.write("|" + str(k) + "|" + str(v) + "\n") + f_in.close() + f_out.close() diff --git a/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py b/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py new file mode 100644 index 00000000..3516bbf5 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/gen_spaces_gpu.py @@ -0,0 +1,95 @@ +from .kernel_compiler import compile_kernel +from collections import namedtuple +from .space import ListConfigSpace + +def get_reduce_axis_length(in_shape,reduce_axis): + lx, ly = 1, 1 + if reduce_axis == None or len(reduce_axis) == len(in_shape): + for v in in_shape: lx *= v + elif (len(in_shape) - 1) in reduce_axis: + for i in range(len(in_shape)): + if i in reduce_axis: + lx *= in_shape[i] + else: + ly *= in_shape[i] + + else: + for i in range(len(in_shape)): + if i in reduce_axis: + ly *= in_shape[i] + else: + lx *= in_shape[i] + + return lx, ly + + +def _get_space_reduce_gpu_manually(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of reduce_sum operators in gpu""" + space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0, + gen_tiling_spaces=True) + + in_shape, reduce_axis = op_desc[2].in_shape, op_desc[2].axis + dim_len = 1 if reduce_axis == None or len(reduce_axis) == len(in_shape) else 2 + dim_names = ['tiling_' + str(i) for i in range(dim_len)] + dim_names.append("block_x") + dim_names.append("block_y") + dim_names.append("block_z") + dim_names.append("thread_x") + dim_names.append("thread_y") + dim_names.append("thread_z") + for key in tuning_attrs_info[0]: + dim_names.append(key) + lx, ly = get_reduce_axis_length(in_shape, reduce_axis) + + tiling_spaces = [] + if reduce_axis == None or 
len(reduce_axis) == len(in_shape): + """all-reduce""" + possible_tx_list = [2**i for i in range(4,11)] + for tx in possible_tx_list: + if tx > lx: break + possible_dim0_list = [d0 for d0 in range(tx, lx+1, tx)] + if possible_dim0_list[-1] != lx: possible_dim0_list.append(lx) + for d0 in possible_dim0_list: + bx = lx//d0 if lx % d0 == 0 else lx//d0+1 + tiling_spaces.append([d0,bx,1,1,tx,1,1]) + + + elif (len(in_shape) - 1) in reduce_axis: + """reduce-x""" + possible_tx_list = [2**i for i in range(4,11)] + for tx in possible_tx_list: + if tx > lx: break + ty = 1 + by = ly + possible_dim1_list = [d1 for d1 in range(tx, lx+1, tx)] + if possible_dim1_list[-1] != lx: possible_dim1_list.append(lx) + for d1 in possible_dim1_list: + bx = lx//d1 if lx % d1 == 0 else lx//d1+1 + tiling_spaces.append([1,d1,bx,by,1,tx,ty,1]) + + else: + """reduce-y""" + tx = min(32,lx) + bx = lx//tx if lx %tx==0 else lx//tx + 1 + d0 = tx + for ty in range(min(8,ly),1025): + if ty * tx > 1024: break + possible_dim1_list = [d1 for d1 in range(ty, ly+1, ty)] + for d1 in possible_dim1_list: + by = ly//d1 if ly % d1 == 0 else ly//d1+1 + tiling_spaces.append([d0,d1,bx,by,1,tx,ty,1]) + + input_type = namedtuple(op_type, dim_names) + space = ListConfigSpace(input_type) + if len(tuning_attrs_info[0]) != 0: + for tiling_space in tiling_spaces: + for tuning_attrs_config in tuning_attrs_info[1]: + tmp = tiling_space[:] + tmp.extend(tuning_attrs_config) + config = input_type(*tmp) + space.add(config) + else: + for tiling_space in tiling_spaces: + config = input_type(*tiling_space) + space.add(config) + return space_res.index_table, space, key, expect, input_for_mod \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/job.py b/tests/fuzz/tune_for_gpu/autotuning/job.py new file mode 100644 index 00000000..50c1b446 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/job.py @@ -0,0 +1,501 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoTuning job""" +import os +import json +import time +import datetime +import importlib +import logging +import pandas as pd +import subprocess +import numpy as np +from collections import namedtuple +from multiprocessing import Process, Manager +from akg import composite +from akg.utils import kernel_exec as utils +from akg.composite.build_module import generate_trait +from autotuning.runner import KernelRunner, error_time_list, error_time_string +from autotuning.tuner import ModelBasedTuner, Tuner +from autotuning.type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc +from autotuning.space_generators import get_space +from autotuning.space import ListConfigSpace +from autotuning.test_data_generators import gen_data +from autotuning.space_generators import gen_bool_list +from autotuning.tuning_utils import * + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger('fuzz.tune.autotuning.job') + +storage_dir = './res/' + +if not os.path.exists(storage_dir): + os.makedirs(storage_dir) + +json_file = './res/' + "{0}" + ".json" +json_load = './autotuning/shapes/' + "{0}" + + +def get_repo(repo, keys, default=None): + for key in keys: + repo = repo.get(key) + if not repo: + return default + return repo + + +def get_json_space(json_input, space_dict): + space_res = composite.get_tiling_space(json_input, 2) + space_dict['res'] = space_res + + +def launch_json(debug_mode: bool = True, save_res: bool = False, json_dir="", repo_path="", 
all_space=False, + skip_exist=True, extra_tune=False, self_attrs=[], tuning_attrs=[]): + """composite json tuning launch""" + subprocess.run("mkdir -p res/", shell=True) + iter_times = [3, 3, 3] if debug_mode else [80, 160, 320] + files = os.listdir(json_dir) + with open(repo_path, 'r') as f: + repo = json.loads(f.read()) + for input_file in files: + print("----Start tuning for ", input_file) + with open(json_dir + '/' + input_file, 'r') as f: + json_input = f.read() + json_content = json.loads(json_input) + for input_desc in json_content["input_desc"]: + if input_desc[0]["shape"] == []: + input_desc[0]["shape"] = [1] + json_input = json.dumps(json_content) + + # skip tuning for info in repo + if skip_exist: + compute, shape, dtype = generate_trait(json_content) + if get_repo(repo, [compute, shape, dtype]): + print("Info for %s already exists" % input_file) + print("ops are ", str(compute)) + print("shape is ", str(shape)) + print("dtype is ", str(dtype)) + with open('res/skip_file.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + + # generate tuning space + if not extra_tune: + time_start_get_space = time.time() + with Manager() as manager: + space_dict = manager.dict() + p = Process(target=get_json_space, + args=(json_input, space_dict)) + p.start() + p.join(600) + if 'res' not in space_dict: + with open('res/error_space_list.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + space_res = space_dict['res'] + time_end_get_space = time.time() + print("get space time: ", time_end_get_space - time_start_get_space) + index_table = space_res['index'] + tiling_spaces = space_res['tuning_space'] + if not isinstance(tiling_spaces, list): + with open('res/empty_space_list.txt', 'a') as fe: + fe.write(input_file) + fe.write("\n") + continue + dim_names = ['tiling_' + str(i) + for i in range(len(tiling_spaces[0]))] + use_tuning_attrs = len(tiling_spaces) < 10 ** 5 + if tuning_attrs and use_tuning_attrs: + dim_names.extend(tuning_attrs) 
+ input_type = namedtuple("json", dim_names) + space = ListConfigSpace(input_type) + if tuning_attrs and use_tuning_attrs: + attr_options = gen_bool_list(tuning_attrs) + for tiling_space in tiling_spaces: + for attr_option in attr_options: + tmp = tiling_space[:] + tmp.extend(attr_option) + config = input_type(*tmp) + space.add(config) + else: + for tiling_space in tiling_spaces: + config = input_type(*tiling_space) + space.add(config) + else: + index_table = [] + pre_lists = gen_bool_list(self_attrs) + pre_input_type = namedtuple("extra_tune", self_attrs) + space = ListConfigSpace(pre_input_type) + for item in pre_lists: + config = pre_input_type(*item) + space.add(config) + + key = json_content["op"] + try: + input_for_mod, expect = gen_data( + op_type="json", op_desc=json_input) + except BaseException as e: + logger.debug( + "gen numpy data from [%s] failed: %s", input_file, str(e)) + with open('res/error_gen_data_list.txt', 'a') as fe: + fe.write(input_file) + fe.write(": ") + fe.write(str(e)) + fe.write("\n") + continue + print('space size:', space.length) + print('index table:', index_table) + + output_para = None # this is for multi-output + if len(json_content["output_desc"]) > 1: + output_para = [] + for i in range(len(json_content["output_desc"])): + output_para.append(i - len(json_content["output_desc"])) + runner = KernelRunner(op_type="json", op_desc=json_input, index_table=index_table, self_attrs=self_attrs, + input_data=input_for_mod, expect=expect, mod_output_param=output_para, timeout=180, + repeat_times=1) + + # we can only get a valid tiling, or accurate get cycles + is_truly_profiling = utils.get_profiling_mode( + ) or os.environ['RUNTIME_MODE'] == "gpu" + + # available device numbers, normally is 8 or 1 + available_device_numbers = utils.get_available_devices_num() + + if all_space: + tuner = Tuner(runner, index_table, space, + n_parallel=available_device_numbers) + least_try_times = 3 # space.length + else: + tuner = ModelBasedTuner(runner, 
index_table, space, + n_parallel=available_device_numbers if is_truly_profiling else 1, + plan_size=64, pre_model=None) + least_try_times = iter_times[0 if space.length < + 10 ** 4 else 1 if space.length < 10 ** 5 else 2] + tuner.tune(least_try_times, output_file="json.log") + + print_tuning_result("json", space, index_table, tuner, key) + + if save_res: + if extra_tune: + save_tuning_result(key, "extra_tune", + json_content, index_table, tuner, repo_path) + else: + save_tuning_result(key, "json", json_content, + index_table, tuner, repo_path) + + +def jobs(op_type: str = 'add', desc=None, debug_mode: bool = True, save_res: bool = False, + all_space: bool = True, insert_key='', conf_of_set_dim="", tuning_attrs=[], skip_config_set=None, tuning_attrs_info=None): + """AutoTuning jobs""" + iter_times = [3, 3, 3] if debug_mode else [80, 160, 320] + time_start_get_space = time.time() + index_table, space, key, expect, input_for_mod = get_space( + op_type, desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info) + time_end_get_space = time.time() + print("get space time: ", time_end_get_space - time_start_get_space) + print('space size:', space.length) + print('index table:', index_table) + key = key if insert_key == '' else insert_key + + # filter already tuned shape + if isinstance(conf_of_set_dim, dict) and key in conf_of_set_dim.keys(): + if isinstance(conf_of_set_dim[key], (list, tuple)) and conf_of_set_dim[key]: + return + + if isinstance(conf_of_set_dim[key], dict): + return + + output_para = None # this is for multi-output + if isinstance(input_for_mod, dict): + input_for_mod, output_para = input_for_mod['args'], input_for_mod['outputs'] + runner = KernelRunner(op_type, desc, index_table, + self_attrs=None, input_data=input_for_mod, + expect=expect, mod_output_param=output_para, + timeout=30, repeat_times=1, + is_all_space=all_space, + skip_config_set=skip_config_set, + need_tune_json=tuning_attrs_info[2]) + + # we can only get a valid tiling, or 
accurate get cycles + is_truly_profiling = utils.get_profiling_mode() + + # number of multi-processing for build kernels + available_device_numbers = get_parallel_build_num() + + time_start_tuning = time.time() + if all_space: + tuner = Tuner(runner, index_table, space, + n_parallel=available_device_numbers) + least_try_times = space.length + else: + tuner = ModelBasedTuner(runner, index_table, space, + n_parallel=available_device_numbers if is_truly_profiling else 1, + plan_size=100, pre_model=None) + least_try_times = space.length + tuner.tune(least_try_times, output_file=op_type + ".log") + + time_end_tuning = time.time() + print("tuning time: ", time_end_tuning - time_start_tuning) + print_tuning_result(op_type, space, index_table, tuner, key) + # save_results_to_csv(op_type, space, index_table, tuner, key) + + # if save_res: + # save_tuning_result(key, op_type, desc, index_table, tuner) + + +def print_tuning_result(op_type, space, index_table, tuner, key): + """print tuning result""" + print(op_type + " shape is:", key) + print('space size:', space.length) + print('index table:', index_table) + print('best config:', tuner.best_config) + print('best time:', + tuner.best_time if tuner.best_time not in error_time_string.keys() else error_time_string[tuner.best_time]) + print('original time:', tuner.original_time) + print('optimal result is ', tuner.original_time / + tuner.best_time, "faster then auto set dim.") + print("total try times", len(tuner.xs)) + for x, y in zip(tuner.xs, tuner.ys): + print(space.get(x), y if y not in error_time_string.keys() + else error_time_string[y]) + + +def save_results_to_csv(op_type, space, index_table, tuner, key): + """save all results to csv""" + data = [] + for x, y in zip(tuner.xs, tuner.ys): + data.append([space.get(x), y if y not in error_time_string.keys() + else 9999999]) + df = pd.DataFrame(data, columns=["config", "time"]) + df.to_csv(op_type + "_" + key + ".csv") + + +def save_tuning_result(key, op_type, desc, 
index_table, tuner, repo_path="", extra_tune=False, platform="gpu"): + """save tuning result""" + if tuner.best_config is not None and tuner.best_time not in error_time_list: + set_dim_configs = tuner.best_config.input + if op_type == "matmul": + param = [] + for _ in range(len(desc.x_shape) - 2): + param.append((1, 1)) + if set_dim_configs.n_l1 > 0: + param.append((set_dim_configs.n_l1, set_dim_configs.n_l0)) + if set_dim_configs.m_l1 > 0: + param.append((set_dim_configs.m_l1, set_dim_configs.m_l0)) + param.extend( + [(16, 16), (16, 16), (set_dim_configs.k_l1, set_dim_configs.k_l0)]) + tiling_param = (param, {"bypass": set_dim_configs.bypass}) + + # special case with different tiling parameter format + elif op_type in ("conv", "conv_bn1"): + param = [] + tile_hh = set_dim_configs.tile_h + tile_coco = set_dim_configs.tile_co + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + tile_ww = set_dim_configs.tile_w + param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + tiling_param = (param, {"bypass": set_dim_configs.bypass}) + elif op_type == "conv_backprop_input": + param = [] + tile_hh = set_dim_configs.tile_h + tile_coco = set_dim_configs.tile_co + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + tile_ww = set_dim_configs.tile_w + param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + tiling_param = (param) + elif op_type == "conv_backprop_filter": + param = [] + tile_cici = set_dim_configs.tile_ci + tile_khkh = set_dim_configs.tile_kh + tile_kwkw = set_dim_configs.tile_kw + tile_coco = set_dim_configs.tile_co + tile_bb = set_dim_configs.tile_batch + tile_hh = set_dim_configs.tile_h + tile_ww = set_dim_configs.tile_w + tile_mm = set_dim_configs.tile_m + tile_kk = set_dim_configs.tile_k + tile_nn = set_dim_configs.tile_n + param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, + tile_bb, tile_hh, tile_ww, tile_mm, tile_kk, tile_nn] + 
tiling_param = (param) + elif ("batch_matmul" in op_type) and (platform == "gpu"): + tiling = [str(getattr(set_dim_configs, name)) for name in getattr( + set_dim_configs, "_fields") if name.startswith("tiling")] + tiling_param = "" + for i, tile_v in enumerate(tiling): + if i % 2 == 0: + tiling_param += "0 " + str(i) + " " + tiling_param += tile_v + " " + + block_param = get_block_str_from_config(set_dim_configs) + thread_param = get_thread_str_from_config(set_dim_configs) + config = { + 'attrs': { + 'dim': tiling_param, + 'bind_block': block_param, + 'bind_thread': thread_param + }, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + 'date': str(datetime.datetime.now()), + 'tuning_time': tuner.tuning_time, + } + elif op_type == "json": + from autotuning.runner import get_attr_from_config + tiling_param = get_attr_from_config(set_dim_configs, index_table) + elif op_type == "reduce_sum_gpu": + print(set_dim_configs) + tiling = [str(getattr(set_dim_configs, name)) + for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')] + tiling_param = "" + for i, tile_v in enumerate(tiling): + tiling_param += "0 " + str(i) + " " + tiling_param += tile_v + " 1 " + + block_param = get_block_str_from_config(set_dim_configs) + thread_param = get_thread_str_from_config(set_dim_configs) + config = { + 'attrs': { + 'dim': tiling_param, + 'bind_block': block_param, + 'bind_thread': thread_param + }, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + 'date': str(datetime.datetime.now()), + 'tuning_time': tuner.tuning_time, + } + else: + print(set_dim_configs) + tiling = [[getattr(set_dim_configs, name), 1] + for name in getattr(set_dim_configs, '_fields') if name.startswith('tiling')] + tiling_param = [] + for i, tile_v in enumerate(tiling): + tiling_param.append(index_table[i] + tile_v) + config = [] + else: + tiling_param = [] + + # when there is a valid result, save the result + if op_type in ("json", 
"extra_tune") and tuner.best_time not in error_time_list: + config = {'attrs': tiling_param, + 'best_cycles': tuner.best_time, + 'original_cycles': tuner.original_time, + "date": str(datetime.datetime.now()), + "tuning time": tuner.tuning_time, + } + if op_type == "json": + config["file_name"] = str(key) + compute, shape, dtype = generate_trait(desc) + tuner.export_dim_configs( + config, json_file.format(op_type), False, str(key)) + save_file = "autotuning/extra_tune.json" if extra_tune else repo_path + with open(save_file, 'r') as f: + repo = json.loads(f.read()) + if len(tiling_param) != 0 and (get_repo(repo, [compute, shape, dtype]) is None or + int(tuner.best_time) < int(repo[compute][shape][dtype]["metadata"]["best_cycles"])): + tuner.export_dim_configs_for_keys(config, save_file, False, [ + compute, shape, dtype, "metadata"]) + else: + try: + tuner.export_dim_configs( + config, json_file.format(op_type), False, str(key)) + except UnboundLocalError as e: + logger.warning(e) + print("[save_tuning_result]: ", "no result is saved.") + + +def load_json_configs(op_type): + """load json configs""" + dim_file = json_file.format(op_type) + file_path = os.path.realpath(dim_file) + if os.path.isfile(file_path): + try: + with open(file_path, 'r') as f: + data = json.load(f) + return data + except IOError as e: + logger.debug(e) + return {} + return {} + + +def read_shapes_from_file(debug_mode, save_res, all_space, conf_of_set_dim, op_type): + """read tuning shapes from file""" + file = importlib.import_module('autotuning.shapes.' 
+ op_type) + shapes = file.shapes + for _, shp in enumerate(shapes): + do_profiling(shp, debug_mode, save_res, + all_space, op_type, conf_of_set_dim) + + +def do_profiling(shp, debug_mode, save_res, all_space, op_type, conf_of_set_dim=None, tuning_attrs=None, skip_config_set=None, tuning_attrs_info=None): + """do profiling""" + # remove undeleted JOB files for previous shapes + subprocess.run("rm -rf /var/log/npu/profiling/JOB*", shell=True) + if op_type == 'matmul': + key = shp[2][0:-1] + logger.debug("start profiling: [%s]", str(key)) + desc = MatmulCubeDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type.startswith('conv_backprop'): + key = shp[2] + logger.debug("start profiling: [%s]", str(key)) + desc = ConvBackpropDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type.startswith('conv') and "gpu" not in op_type: + key = shp[2] + logger.debug("start profiling: [%s]", str(key)) + desc = ConvDesc(*key) + jobs(op_type, desc, debug_mode, save_res, + all_space, key.__str__(), conf_of_set_dim) + logger.debug("end profiling: [%s]", str(key)) + elif op_type in ["batch_matmul_gpu", "conv_image2col_gemm_gpu", "reduce_sum_gpu"]: + logger.debug("start profiling: [%s]", str(shp)) + jobs(op_type, shp, debug_mode, save_res, + all_space, conf_of_set_dim=conf_of_set_dim, tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info) + else: + key = shp + logger.debug("start profiling: [%s]", str(key)) + desc = key + jobs(op_type, desc, debug_mode, save_res, + all_space, conf_of_set_dim=conf_of_set_dim, skip_config_set=skip_config_set) + logger.debug("end profiling: [%s]", str(key)) + + +def launch(op_type, debug_mode, save_res=False, desc=None, all_space=False, + from_json=False, tuning_attrs=None, skip_config_set=None, 
tuning_attrs_info=None): + # get the existed tiling + conf_of_set_dim = load_json_configs(op_type) if from_json else None + + if desc is None: + read_shapes_from_file(debug_mode, save_res, + all_space, conf_of_set_dim, op_type) + else: + shp = desc + do_profiling(shp, debug_mode, save_res, all_space, op_type, + tuning_attrs=tuning_attrs, skip_config_set=skip_config_set, tuning_attrs_info=tuning_attrs_info) diff --git a/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py b/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py new file mode 100644 index 00000000..efb10f6d --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/kernel_compiler.py @@ -0,0 +1,407 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Compile kernel module for operator""" +import os +from typing import NamedTuple +from base import TestBase +from akg.utils import kernel_exec as utils +from akg.utils import custom_tiling as ct_util +from akg.ops.nn import conv_bn1 +from akg.ops.nn import conv, conv_backprop_input, conv_backprop_filter, batchmatmul +from test_op.batch_matmul import batch_matmul +from akg.ops.math_gpu.reduce_sum import reduce_sum +from akg.build_module import tuning_spaces +from akg.ops.nn import matmul +from test_run import batchmatmul_run, matmul_run +from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig +import numpy as np +from gen_random import random_gaussian +from .tuning_utils import merge_attrs + + +def get_spaces_gpu_manually(op_type: str, op_desc: NamedTuple = None): + # wait for implementation + return + + +def gen_kernel_conv(op_desc: ConvDesc, input_shape, index_table, + config: ConvConfig = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param, 'bypass': config.bypass} + + if op_desc.use_bias: + shape = [input_shape[0], input_shape[1], input_shape[2]] + else: + shape = [input_shape[0], input_shape[1]] + conv_dtype = 'float16' + + return utils.op_build(conv.conv, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride, + op_desc.dilation, op_desc.use_bias, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, 
tuning=gen_tiling_spaces) + + +def gen_kernel_conv_bn1(op_desc: ConvDesc, input_shape, index_table, config: ConvConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_bn1""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_bn1_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param, 'bypass': config.bypass} + + if op_desc.use_bias: + shape = [input_shape[0], input_shape[1], input_shape[2]] + else: + shape = [input_shape[0], input_shape[1]] + conv_dtype = 'float16' + + return utils.op_build(conv_bn1.conv_bn1, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, op_desc.stride, + op_desc.dilation, op_desc.use_bias, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_matmul_cube(op_desc: MatmulCubeDesc, _, index_table, + config: MatmulCubeConfig = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for matmul_cube""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "matmul_cube_poly" + if idx is not None: + kernel_name += str(idx) + if config is None: + attrs = {'dim': ""} + else: + tiling_param = [] + for _ in range(len(op_desc.x_shape) - 2): + tiling_param.append((1, 1)) + if config.n_l1 > 0: + tiling_param.append((config.n_l1, config.n_l0)) + if config.m_l1 > 0: + tiling_param.append((config.m_l1, config.m_l0)) + tiling_param.extend([(16, 16), (16, 16), (config.k_l1, config.k_l0)]) + dim_info = ct_util.set_dims(tuple(tiling_param)) + attrs = {'dim': dim_info, 'bypass': config.bypass} + return 
matmul_run.matmul_compile(op_desc.x_shape, op_desc.y_shape, op_desc.bias, op_desc.left_format, + op_desc.right_format, op_desc.out_format, op_desc.adj_x, op_desc.adj_y, + op_desc.dtype, op_desc.bias_dtype, op_desc.out_dtype, kernel_name, + attrs, tuning=gen_tiling_spaces) + + +def gen_kernel_conv_backprop_input(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropInputConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_backprop_input""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_backprop_input_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_hh = config.tile_h + tile_coco = config.tile_co + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tile_ww = config.tile_w + tiling_param = [tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, tile_ww] + attrs = {'conv_tile': tiling_param} + + conv_dtype = 'float16' + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = op_desc.pad + stride_h, stride_w = op_desc.stride + + out_n = in_n + out_c = cout + out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1 + out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1 + + x_shape = (out_n, out_c, out_h, out_w) + w_shape = (cout, in_c, w_h, w_w) + in_nn, in_cc, in_hh, in_ww = x_shape + input_shape_nc1hwc0 = (in_nn, in_cc // block_size, + in_hh, in_ww, block_size) + k_n, k_c, k_h, k_w = w_shape + kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size) + k_n, _, k_h, k_w, _ = kernel_shape_nc1hwc0 + kernel_shape_fractal = (k_c // block_size * k_h * + k_w, k_n // block_size, block_size, block_size) + + shape = [input_shape_nc1hwc0, kernel_shape_fractal] 
+ + return utils.op_build(conv_backprop_input.conv_backprop_input, [shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_conv_backprop_filter(op_desc: ConvBackpropDesc, _, index_table, config: ConvBackpropFilterConfig = None, + idx=None, gen_tiling_spaces=False): + """Compile kernel module for conv_backprop_filter""" + if index_table is not None: + raise RuntimeError('index_table should be none') + kernel_name = "conv_backprop_filter_poly" + if idx is not None: + kernel_name += str(idx) + + if config is None: + attrs = {'dim': ""} + else: + tile_cici = config.tile_ci + tile_khkh = config.tile_kh + tile_kwkw = config.tile_kw + tile_coco = config.tile_co + tile_bb = config.tile_batch + tile_hh = config.tile_h + tile_ww = config.tile_w + tile_mm = config.tile_m + tile_kk = config.tile_k + tile_nn = config.tile_n + tiling_param = [tile_cici, tile_khkh, tile_kwkw, tile_coco, tile_bb, tile_hh, tile_ww, + tile_mm, tile_kk, tile_nn] + attrs = {'conv_tile': tiling_param} + + conv_dtype = 'float16' + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = op_desc.pad + stride_h, stride_w = op_desc.stride + + out_n = in_n + out_c = cout + out_h = (in_h + pad_top + pad_bottom - w_h) // stride_h + 1 + out_w = (in_w + pad_left + pad_right - w_w) // stride_w + 1 + + x_shape = (in_n, in_c, in_h, in_w) + y_shape = (out_n, out_c, out_h, out_w) + in_n, in_c, in_h, in_w = x_shape + input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size) + o_n, o_c, o_h, o_w = y_shape + kernel_shape_nc1hwc0 = (o_n, o_c // block_size, o_h, o_w, block_size) + o_n, o_c1, o_h, o_w, o_c0 = 
kernel_shape_nc1hwc0 + mo = (o_h * o_w + block_size - 1) // block_size + mi = block_size + kernel_shape_fractal = (o_n, o_c1, mo, mi, o_c0) + + input_shape = [kernel_shape_fractal, input_shape_nc1hwc0] + + return utils.op_build(conv_backprop_filter.conv_backprop_filter, [input_shape], [conv_dtype], + op_attrs=[op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation, attrs], + kernel_name=kernel_name, attrs=attrs, polyhedral=True, tuning=gen_tiling_spaces) + + +def gen_kernel_for_vector(op_desc, _, index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False): + """Compile kernel module for vector""" + test_base = TestBase() + test_base.params_init(op_desc[0][0:4] + str(idx), os.getcwd()) + kernel_name = "poly_" + if idx is not None: + kernel_name += str(idx) + if config is None: + attrs = {'dim': ""} + else: + tiling = [[getattr(config, name), 1] for name in getattr( + config, '_fields') if name.startswith('tiling')] + tiling_param = [] + for i, element in enumerate(tiling): + tiling_param.append(index_table[i] + element) + dim_info = ct_util.set_dims(tuple(tiling_param)) + attrs = {'dim': dim_info} + _, func, args, kwargs = test_base.ana_args(op_desc) + if 'attrs' in kwargs.keys(): + kwargs['attrs']['dim'] = attrs['dim'] + kwargs['attrs']['tuning'] = gen_tiling_spaces + kwargs['attrs']['kernel_name'] = kernel_name + else: + for _, arg_ in enumerate(args): + if isinstance(arg_, dict): + arg_['dim'] = attrs['dim'] + arg_['tuning'] = gen_tiling_spaces + arg_['kernel_name'] = kernel_name + break + try: + if gen_tiling_spaces: + mod, expect, param_for_mod = func(*args, **kwargs) + mod = list(mod) + mod.append(expect) + mod.append(param_for_mod) + else: + mod = func(*args, **kwargs) + except BaseException as e: + print("Compile ERROR message:", e) + print(func) + print("Compile ERROR") + raise Exception("Compile ERROR") + + return mod + + +def gen_kernel_batch_matmul_gpu(op_desc, _, index_table=None, + config: 
NamedTuple = None, idx=None, + gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for batch_matmul in gpu""" + kernel_name = "batch_matmul_gpu_" + # wait for implementation + return + + +def gen_kernel_reduce_sum_gpu(op_desc, _, index_table=None, + config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for reduce_sum in gpu""" + kernel_name = "reduce_sum_gpu_" + if idx is not None: + kernel_name += str(idx) + attrs = op_desc[2] + if config is not None: + attrs = merge_attrs(attrs, config, need_tune_json) + + try: + if gen_tiling_spaces: + # NOTE: don't use this process for reduce spaces generation, + # see function: "_get_space_reduce_gpu_manually". + from .tiling_strategies_gpu import reduce_gpu_tiling_strategy + spaces, set_dim_key = utils.op_build(reduce_sum, (attrs.in_shape, ), + (attrs.in_dtype, + ), kernel_name="reduce_sum", + op_attrs=[ + attrs.axis, attrs.keepdims], + attrs={"target": "cuda", + "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib, + "enable_atomic_add": attrs.enable_atomic_add, + "custom_tiling": reduce_gpu_tiling_strategy(attrs.in_shape, attrs.axis)}, tuning=True) + + from test_ms_reduce_sum import gen_data + input_for_mod, output, expect = gen_data( + attrs.in_shape, attrs.in_dtype, attrs.axis, attrs.keepdims) + return [spaces, set_dim_key, expect, [input_for_mod, output]] + else: + mod = utils.op_build(reduce_sum, (attrs.in_shape, ), + (attrs.in_dtype, + ), kernel_name="reduce_sum", + op_attrs=[ + attrs.axis, attrs.keepdims], + attrs={"target": "cuda", + "enable_akg_reduce_lib": attrs.enable_akg_reduce_lib, + "dim": attrs.dim, + "bind_block": attrs.bind_block, + "bind_thread": attrs.bind_thread, + "enable_atomic_add": attrs.enable_atomic_add}) + return mod + except BaseException as e: + print("Compile ERROR message:", e) + print(reduce_sum) + print("Compile ERROR") + raise Exception("Compile ERROR") + + +def gen_kernel_conv_image2col_gemm_gpu(op_desc, _, 
index_table=None, config: NamedTuple = None, idx=None, gen_tiling_spaces=False, need_tune_json=None): + """Compile kernel module for convolution in gpu using image2col+gemm""" + # wait for implementation + return + + +_compile_kernel_func = { + 'conv': gen_kernel_conv, + 'conv_bn1': gen_kernel_conv_bn1, + 'conv_backprop_input': gen_kernel_conv_backprop_input, + 'conv_backprop_filter': gen_kernel_conv_backprop_filter, + 'matmul': gen_kernel_matmul_cube, + 'reduce_sum_gpu': gen_kernel_reduce_sum_gpu, + 'batch_matmul_gpu': gen_kernel_batch_matmul_gpu, + 'conv_image2col_gemm_gpu': gen_kernel_conv_image2col_gemm_gpu, +} + + +def compile_kernel(op_type: str, op_desc: NamedTuple, input_shape=None, index_table=None, + config_param: NamedTuple = None, idx: int = None, gen_tiling_spaces: bool = False, need_tune_json=None): + """Generate kernel module for operator + + Parameters + op_type: str + operator name + op_desc: NamedTuple + operator definition parameters + config_param: NameTuple + operator config parameters + idx: int + operator idx(th) kernel + gen_tiling_spaces: bool + parameter passed to utils.op_build, whether to get spaces instead of stmt + ---------- + + Returns: + kernel if gen_tiling_spaces == False else np.ndarray + """ + gen_func = _compile_kernel_func.get(op_type, None) + if gen_func is None: + gen_func = gen_kernel_for_vector + if gen_tiling_spaces: + space_res, key, expect, input_for_mod = gen_func(op_desc, input_shape, index_table, config_param, + idx, gen_tiling_spaces) + else: + if "gpu" in op_type: + mod = gen_func(op_desc, input_shape, index_table, + config_param, idx, gen_tiling_spaces, need_tune_json=need_tune_json) + else: + mod = gen_func(op_desc, input_shape, index_table, + config_param, idx, gen_tiling_spaces) + + return [space_res, key, expect, input_for_mod] if gen_tiling_spaces else mod diff --git a/tests/fuzz/tune_for_gpu/autotuning/runner.py b/tests/fuzz/tune_for_gpu/autotuning/runner.py new file mode 100644 index 00000000..a3400932 --- 
# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Runner for compile and execute a configs of an operator on device"""
import time
import multiprocessing
import logging
import json
import os
import subprocess
from typing import NamedTuple
import numpy as np
from akg import composite
from akg.utils import custom_tiling as ct_util
from akg.utils import kernel_exec as utils
from .kernel_compiler import compile_kernel
from .test_data_generators import gen_data
# Star import kept for compatibility; it provides get_available_gpu_num().
from .tuning_utils import *

logger = logging.getLogger('fuzz.tune.autotuning.runner')

# Sentinel "run times" encoding the different failure modes.
run_failed_time = 9999999999.0
precision_error_time = 9999999998.0
compile_fail_time = 9999999997.0
timeout_time = 9999999996.0

error_time_list = [
    run_failed_time,
    precision_error_time,
    compile_fail_time,
    timeout_time,
]

error_time_string = {
    run_failed_time: 'run_failed',
    precision_error_time: 'precision_error',
    compile_fail_time: 'compile_failed',
    timeout_time: 'timeout',
}


def get_attr_from_config(config, index_table):
    """Split a config namedtuple into a tiling 'dim' string plus plain attrs.

    Fields named tiling* are turned into [value, 1] pairs, offset by
    index_table, and folded into attrs['dim']; all other fields are copied
    through unchanged.
    """
    tiling = []
    attrs = {}
    for key, value in config._asdict().items():
        if key.startswith('tiling'):
            tiling.append([value, 1])
        else:
            attrs[key] = value
    if tiling:
        tiling_param = [index_table[i] + element for i, element in enumerate(tiling)]
        attrs['dim'] = ct_util.set_dims(tuple(tiling_param))
    else:
        print("No tiling info. Use auto tiling.")
    return attrs


class KernelRunner:
    """kernel runner
    This runner will compile and execute configs of an operator, and return their running times.

    Parameters
    ----------
    op_type: str
        The name of operator
    op_desc: NamedTuple
        The definition parameters of operator
    timeout: int
        Timeout for running one config
    repeat_times:
        Run one config repeat_times
    """

    def __init__(self, op_type: str, op_desc: NamedTuple,
                 index_table: list, self_attrs: list, timeout: int = 600,
                 repeat_times: int = 2, input_data=None,
                 expect=None, mod_output_param=None, is_all_space=True,
                 skip_config_set=None, need_tune_json=None):
        self.op_type = op_type
        self.op_desc = op_desc
        self._index_table = index_table
        self.self_attrs = self_attrs
        self.run_kernel_time = 0.0
        self.tune_self_attrs = True
        self.timeout = timeout
        self.repeat_times = repeat_times
        self.mod_output_param = mod_output_param
        self.is_all_space = is_all_space
        self.skip_config_set = skip_config_set
        self.need_tune_json = need_tune_json
        if input_data is None:
            self.input, self.expect = gen_data(op_type, op_desc)
            if isinstance(self.input, dict):
                self.input, self.mod_output_param = self.input['args'], self.input['outputs']
        else:
            self.input, self.expect = input_data, expect
        self.input_shape = [x.shape for x in self.input]

    def info(self):
        print('run kernel time:', self.run_kernel_time)

    def run_one_kernel(self, run_times, idx, config, best_time=np.inf, is_auto=False):
        """Compile and execute one config of the operator on device.

        Writes the result (a runtime, or an error sentinel) into
        run_times[idx]; best_time is kept for signature compatibility.
        """
        if json.dumps(config.input._asdict()) in self.skip_config_set:
            print("CONFIG SKIP:", json.dumps(config.input._asdict()))
            run_times[idx] = -1
            return

        time_one_kernel_start = time.time()
        logger.debug('compile %dth kernel', idx)
        # Round-robin the available GPUs across worker processes.
        gpu_devices_list = get_available_gpu_num()
        device_id = gpu_devices_list[idx % len(gpu_devices_list)]
        logger.debug('run %dth kernel on device %s', idx, device_id)
        try:
            time_start_build = time.time()
            logger.debug(config)
            if self.op_type in ("json", "extra_tune"):
                # NOTE(review): the original nesting here was ambiguous in
                # transit; this is the only coherent reading (auto build vs.
                # explicit-attrs build) — confirm against upstream.
                if is_auto:
                    mod = composite.build(self.op_desc)
                    if self.op_type == "extra_tune":
                        del os.environ['MS_GRAPH_KERNEL_TILING']
                else:
                    attrs = get_attr_from_config(config.input, self._index_table)
                    if os.environ.get('RUNTIME_MODE') == "gpu":
                        attrs['target'] = "cuda"
                    mod = composite.build(self.op_desc, attrs, use_repo=False)
            else:
                mod = compile_kernel(self.op_type, self.op_desc, self.input_shape, self._index_table,
                                     None if is_auto else config.input, idx,
                                     need_tune_json=self.need_tune_json)
            logger.debug("build module time: %f", time.time() - time_start_build)
            logger.debug('finished compile %dth kernel', idx)
        except BaseException as e:
            logger.debug("Compile Failed: [%s] : %s",
                         "origin" if is_auto else str(config.input), str(e))
            run_times[idx] = compile_fail_time
            return

        run_times[idx] = run_failed_time
        try:
            # NOTE: in gpu tuning there is no need to loop for repeat_times;
            # repetition is handled inside mod_launch in tuning mode.
            for _ in range(self.repeat_times):
                stat_info = {}
                try:
                    time_start_launch = time.time()
                    if self.mod_output_param is None:
                        output, stat_info = utils.mod_launch(
                            mod, self.input, tuning=True, device_id=device_id, repeat_time=40)
                        if not np.allclose(output, self.expect, rtol=5e-03, atol=5e-03,
                                           equal_nan=True):
                            stat_info['run_time'] = precision_error_time
                            logger.debug("Precision Error: [%s]",
                                         "origin" if config is None else str(config.input))
                    logger.debug("mod launch time: %f", time.time() - time_start_launch)
                except BaseException as e:
                    logger.debug("Run Failed: [%s] : %s", str(config.input), str(e))
                    stat_info['run_time'] = run_failed_time
                # .get guards the branch where no launch happened (no KeyError).
                run_times[idx] = np.minimum(
                    run_times[idx], stat_info.get('run_time', run_failed_time))
        finally:
            logger.debug('end of %dth kernel', idx)
            logger.debug('run one kernel time: %f', time.time() - time_one_kernel_start)

    def run(self, configs, best_time=np.inf, is_auto_set_dim=False, all_space=False):
        """Compile and execute a batch of configs; returns the list of run times."""
        start = time.time()
        logger.setLevel(logging.DEBUG)
        logger.debug("gen cce kernels batch: %d kernels", len(configs))
        subprocess.run("rm -rf ./jobs/JOB*", shell=True)

        run_times = multiprocessing.Manager().list(
            np.full((len(configs),), compile_fail_time))
        process_jobs = []
        for idx, config in enumerate(configs):
            p = multiprocessing.Process(target=self.run_one_kernel,
                                        args=(run_times, idx, config, best_time, is_auto_set_dim))
            process_jobs.append(p)
            p.start()

        # After the first timeout, the remaining workers are reaped without
        # waiting another full timeout each.
        timeout_error = False
        for idx, p in enumerate(process_jobs):
            if not timeout_error:
                p.join(timeout=self.timeout)
            if p.is_alive():
                timeout_error = True
                logger.debug("Timeout Error: [%s]", str(configs[idx].input))
                run_times[idx] = timeout_time
                p.terminate()

        logger.debug("process time: %f", time.time() - start)
        # Clean the meta directories produced by GPU builds.  (The unused
        # DEVICE_ID/DEVICE_TOTAL_NUM env reads were removed: they could raise
        # KeyError and their values were never used.)
        if os.environ.get('RUNTIME_MODE') == "gpu":
            subprocess.run("rm -rf cuda_meta_*", shell=True)

        end = time.time()
        logger.debug("run kernels time: %f", end - start)
        self.run_kernel_time += end - start

        for idx, config in enumerate(configs):
            if run_times[idx] not in error_time_list:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(run_times[idx]))
            else:
                logger.debug("KernelRunTime : [%s] : %s",
                             str(configs[idx].input), str(error_time_string[run_times[idx]]))

        return run_times


# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +"""Config space""" +from abc import ABCMeta, abstractmethod +from typing import NamedTuple, List +import random +import numpy as np + + +class ConfigEntity: + """General config entity""" + + def __init__(self, input_id: int, input_space: NamedTuple): + self.__input = input_space + self.__input_id = input_id + self.__input_type = type(input_space) + + def __len__(self): + return len(self.__input) + + def __str__(self): + return str(self.__input_id) + ': ' + str(self.__input) + + def __repr__(self): + return str(self) + + @property + def input_id(self): + return self.__input_id + + @property + def input_type(self): + return self.__input_type + + @property + def input(self): + return self.__input + + @property + def feature(self): + return self.__input + + +class ConfigSpace(metaclass=ABCMeta): + """Searching space of configs""" + + def __init__(self, input_type): + self._input_type = input_type + self._dim_names = getattr(self._input_type, '_fields') + + self._configs = [] # List[ConfigEntity] + + @abstractmethod + def reset_fetch(self): + pass + + @abstractmethod + def has_next(self) -> bool: + pass + + @abstractmethod + def fetch_index(self) -> int: + """fetch a random index of config""" + + @abstractmethod + def fetch_config(self) -> ConfigEntity: + """fetch a random config""" + + @abstractmethod + def random_walk(self, p: int) -> int: + """find a neighbor hood of the p-th ConfigEntity, which only + differs with p in at most one dimension""" + + def get(self, idx: int) -> ConfigEntity: + """get the `idx`-th config of the space""" + return self._configs[idx] + + @property + def configs(self): + return self._configs + + @property + def dim_names(self): + return self._dim_names + + @property + def input_type(self): + return self._input_type + + @property + # @abstractmethod + def length(self): + return len(self.configs) + + +class ConfigTrie: + """Trie node for config entities""" + + def __init__(self): + self.ch = dict() + + def add(self, config: ConfigEntity, 
last_dim: int): + """add a ConfigEntity""" + cur = self + for i, x in enumerate(config.input): + if i == last_dim: + continue + if x not in cur.ch: + cur.ch[x] = ConfigTrie() + if not isinstance(cur.ch, dict): + raise TypeError('none-leaf node should have a dict of childs') + cur = cur.ch[x] + + if not isinstance(cur.ch, list): + cur.ch = [] + cur.ch.append(config.input_id) + + def fetch_random(self, config: ConfigEntity, last_dim: int) -> int: + """randomly fetch the index of a ConfigEntity the same with `config` except for the `last_dim`-th dimension""" + cur = self + for i, x in enumerate(config.input): + if i == last_dim: + continue + if not isinstance(cur.ch, dict): + raise TypeError('none leaf node should have a dict of childs') + if x not in cur.ch: + raise RuntimeError('no element found') + cur = cur.ch[x] + if not cur.ch: + raise RuntimeError('no element found') + if len(cur.ch) == 1: + return cur.ch[0] + idx = config.input_id + while idx == config.input_id: + idx = random.choice(cur.ch) + return idx + + +class ListConfigSpace(ConfigSpace): + """Searching space of configs, which stores all possible configs in a list""" + + def __init__(self, input_type): + super(ListConfigSpace, self).__init__(input_type) + + self.__config_tries = [ConfigTrie() for _ in range(len(self._dim_names))] + self.__fetch_pool = [] + + def reset_fetch(self): + """reset fetch state""" + self.__fetch_pool = [i for i in range(len(self._configs))] + + def fetch_scope(self, start, end): + self.__fetch_pool = [i for i in range(start, end)] + + def has_next(self) -> bool: + return len(self.__fetch_pool) > 0 + + def fetch_index(self) -> int: + """fetch a random index of config""" + idx = np.random.randint(len(self.__fetch_pool)) + ret = self.__fetch_pool[idx] + self.__fetch_pool[idx] = self.__fetch_pool[-1] + self.__fetch_pool.pop() + return ret + + def fetch_next_index(self) -> int: + """fetch next index of config""" + idx = len(self.__fetch_pool) - 1 + self.__fetch_pool[0] + 
self.__fetch_pool.pop() + return idx + + def fetch_config(self) -> ConfigEntity: + """fetch a random config""" + return self.get(self.fetch_index()) + + def add(self, input_space: NamedTuple): + """add a new config to space""" + if not isinstance(input_space, self._input_type): + raise TypeError('invalid config input space type, got {} expected {}'.format(type(input_space), + self._input_type)) + config = ConfigEntity(len(self._configs), input_space) + self.__fetch_pool.append(len(self._configs)) + for i in range(len(self._dim_names)): + self.__config_tries[i].add(config, i) + self._configs.append(config) + + def random_walk(self, p: int) -> int: + """find a neighbor hood of the p-th ConfigEntity, which only differs with p in at most one dimension""" + dim = np.random.randint(len(self._dim_names)) + return self.__config_tries[dim].fetch_random(self._configs[p], dim) + + @property + def length(self): + return len(self._configs) + + @classmethod + def from_list(cls, configs: List[NamedTuple]): + if not isinstance(configs, list): + raise TypeError('configs must be of list type, got %s' % type(configs)) + if not configs: + raise ValueError('configs must be non-empty') + space = cls(type(configs[0])) + for config in configs: + space.add(config) + return space diff --git a/tests/fuzz/tune_for_gpu/autotuning/space_generators.py b/tests/fuzz/tune_for_gpu/autotuning/space_generators.py new file mode 100644 index 00000000..ba6c6ae3 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/space_generators.py @@ -0,0 +1,753 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""space generating functions for operators"""
from functools import partial
from typing import NamedTuple
from collections import namedtuple
from enum import Enum
from itertools import product
from test_run import matmul_run
from akg.utils import validation_check as vc_util
from tqdm import tqdm
from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc, ConvConfig, \
    ConvBackpropInputConfig, ConvBackpropFilterConfig, MatmulCubeConfig
from .space import ListConfigSpace
from .kernel_compiler import compile_kernel
from .gen_spaces_gpu import _get_space_reduce_gpu_manually

GPU_IDX_TO_STR = {0: "x", 1: "y", 2: "z"}


class GpuSpacePolicy(Enum):
    """Policy to expand tile candidates with block and thread."""
    FULL = "FULL"
    BMM = "BMM"
    REDUCE_ALL = "REDUCE_ALL"
    REDUCE_X = "REDUCE_X"
    REDUCE_Y = "REDUCE_Y"


def gen_bool_list(attr_list):
    """Return every True/False combination of length len(attr_list).

    Equivalent to the cartesian product {True, False}^n; an empty attr_list
    yields an empty list (kept for backward compatibility with the original
    hand-rolled expansion).
    """
    if not attr_list:
        return []
    return [list(combo) for combo in product([True, False], repeat=len(attr_list))]


def _get_space_vector(op_type: str, op_desc):
    """get config space of vector operator"""
    space_res, key, expect, input_for_mod = compile_kernel(op_type, op_desc, None, None, None, 0,
                                                           gen_tiling_spaces=True)
    if space_res is None:
        raise RuntimeError('no space returned')
    if 'index' not in space_res or 'tuning_space' not in space_res:
        raise RuntimeError('invalid space returned')
    index_table = space_res['index']
    tiling_spaces = space_res['tuning_space']
    if not tiling_spaces:
        raise RuntimeError('empty tiling spaces')

    # One namedtuple field per tiling axis reported by the compiler.
    dim_names = ['tiling_' + str(i) for i in range(len(tiling_spaces[0]))]
    input_type = namedtuple(op_type, dim_names)
    space = ListConfigSpace(input_type)
    for tiling_space in tiling_spaces:
        space.add(input_type(*tiling_space))
    return index_table, space, key, expect, input_for_mod


def _conv_tiling_space(op_desc: ConvDesc, l0c_divisor: int = 1):
    """Enumerate the tiling space shared by conv and conv_bn1.

    The two operators previously duplicated ~100 lines; they differ only in
    the L0C budget (conv_bn1 keeps four result tensors, so it divides the
    budget by 4).  Returns the usual 5-tuple
    (index_table, space, key, expect, input_for_mod).
    """
    if not isinstance(op_desc, ConvDesc):
        raise TypeError('op_desc must be ConvDesc')

    stride_ = op_desc.stride
    pad_ = op_desc.pad
    dilation_ = op_desc.dilation
    vc_util.convolution_format_check(
        op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_)
    config_space = ListConfigSpace(ConvConfig)

    # if double buffering is not enabled, keep this scale factor at 1
    size_scale = 1
    l1_max_size = (1024 * 1024) // size_scale
    l0a_max_size = (64 * 1024) // size_scale
    l0b_max_size = (64 * 1024) // size_scale
    l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 // l0c_divisor

    _, in_c, in_h, in_w = op_desc.fmap_shape
    k_n, _, k_h, k_w = op_desc.filter_shape
    p_top, p_bottom, p_left, p_right = pad_[0], pad_[1], pad_[2], pad_[3]
    s_h, s_w = stride_

    in_c = ((in_c - 1) // 16 + 1) * 16
    tile_c = in_c
    tile_co_start = 16
    data_len = 2  # bytes per fp16 element

    h_max = in_h + p_top + p_bottom
    win_h = (h_max - k_h) // s_h + 1
    h_max = (h_max - k_h) // s_h * s_h + k_h
    w_max = in_w + p_left + p_right
    win_w = (w_max - k_w) // s_w + 1
    w_max = (w_max - k_w) // s_w * s_w + k_w

    for bypass in (0, 1):
        for tile_h in range(h_max, k_h - 1, -s_h):
            size_h = tile_h
            if tile_h == h_max:
                w_range = range(w_max, k_w - 1, -s_w)
                size_h = in_h
            else:
                w_range = [w_max]
                win_tile_h = (tile_h - k_h) // s_h + 1
                h_tiles = (win_h + win_tile_h - 1) // win_tile_h
                if h_tiles == 2:
                    size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h)

            for tile_w in w_range:
                size_w = tile_w
                if size_w == w_max:
                    size_w = in_w
                else:
                    win_tile_w = (tile_w - k_w) // s_w + 1
                    w_tiles = (win_w + win_tile_w - 1) // win_tile_w
                    if w_tiles == 2:
                        size_w = max(tile_w - p_left, in_w + p_left - tile_w + k_w - s_w)

                k_n_ = ((k_n - 1) // 16 + 1) * 16
                for tile_co in co_range(k_n_, tile_co_start) if False else range(k_n_, tile_co_start - 1, -16):
                    if bypass == 1:
                        # bypass mode requires the full co dimension in one tile.
                        if tile_co != k_n:
                            continue
                        l1_size = data_len * (size_h * size_w * in_c)
                    else:
                        l1_size = data_len * (size_h * size_w * in_c
                                              + tile_co * tile_c * k_h * k_w)
                    if l1_size > l1_max_size:
                        continue

                    tile_co_ = ((tile_co - 1) // 16 + 1) * 16
                    for tile_n in range(tile_co_, 15, -16):
                        k_max_ = ((in_c * k_h * k_w - 1) // 16 + 1) * 16
                        k_size_ = l0b_max_size // data_len // tile_n // 16 * 16
                        for tile_k in range(min(k_max_, k_size_), 15, -16):
                            m_max = (((tile_h - k_h) // s_h) + 1) * (((tile_w - k_w) // s_w) + 1)
                            m_max_ = ((m_max - 1) // 16 + 1) * 16
                            m_size1_ = l0a_max_size // data_len // tile_k // 16 * 16
                            m_size2_ = l0c_max_size // data_len // tile_n // 16 * 16
                            for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16):
                                config_space.add(ConvConfig(tile_h, tile_co, tile_m, tile_k,
                                                            tile_n, tile_w, bypass))

    return None, config_space, op_desc.__str__(), None, None


def _get_space_conv(op_desc: ConvDesc):
    """get config space of convolution"""
    return _conv_tiling_space(op_desc)


def _get_space_conv_bn1(op_desc: ConvDesc):
    """get config space of fused convolution + bn1 (L0C budget divided by 4)"""
    return _conv_tiling_space(op_desc, l0c_divisor=4)
\ + ((in_w + pad_[2] + pad_[3] - k_w) // stride_[1] + 1) + + s_h = 1 + s_w = 1 + + tile_c = out_c + tile_co_start = 16 + + data_len = 2 + + h_max = out_h + p_top + p_bottom + win_h = (h_max - k_h) // s_h + 1 + h_max = (h_max - k_h) // s_h * s_h + k_h + w_max = out_w + p_left + p_right + win_w = (w_max - k_w) // s_w + 1 + w_max = (w_max - k_w) // s_w * s_w + k_w + + for tile_h in range(h_max, k_h - 1, -s_h): + size_h = tile_h + if tile_h == h_max: + w_range = range(w_max, k_w - 1, -s_w) + size_h = in_h + else: + w_range = [w_max] + win_tile_h = (tile_h - k_h) // s_h + 1 + h_tiles = (win_h + win_tile_h - 1) // win_tile_h + if h_tiles == 2: + size_h = max(tile_h - p_top, in_h + p_top - tile_h + k_h - s_h) + + for tile_w in w_range: + size_w = tile_w + if size_w == w_max: + size_w = in_w + else: + win_tile_w = (tile_w - k_w) // s_w + 1 + w_tiles = (win_w + win_tile_w - 1) // win_tile_w + if w_tiles == 2: + size_w = max(tile_w - p_left, in_w + + p_left - tile_w + k_w - s_w) + + k_n_ = ((k_n - 1) // 16 + 1) * 16 + co_range = range(k_n_, tile_co_start - 1, -16) + for tile_co in co_range: + l1_size = data_len * (size_h * size_w * out_c + + tile_co * tile_c * k_h * k_w) + if l1_size > l1_max_size: + continue + ub_size = data_len * (size_h * size_w * out_c) + if ub_size > ub_max_size: + continue + + tile_co_ = ((tile_co - 1) // 16 + 1) * 16 + for tile_n in range(tile_co_, 15, -16): + k_max = out_c * k_h * k_w + k_base = 16 * k_h * k_w + k_max_ = ((k_max - 1) // k_base + 1) * k_base + k_size = l0b_max_size // data_len // tile_n + k_size_ = k_size // k_base * k_base + for tile_k in range(min(k_max_, k_size_), k_base - 1, -k_base): + m_max = (int(((tile_h - k_h) // (s_h)) + 1)) * \ + (int(((tile_w - k_w) // (s_w)) + 1)) + m_max_ = ((m_max - 1) // 16 + 1) * 16 + m_size1 = l0a_max_size // data_len // tile_k + m_size1_ = m_size1 // 16 * 16 + m_size2 = l0c_max_size // data_len // tile_n + m_size2_ = m_size2 // 16 * 16 + for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16): 
+ config_space.add(ConvBackpropInputConfig(tile_h, tile_co, tile_m, + tile_k, tile_n, tile_w)) + return None, config_space, op_desc.__str__(), None, None + + +def _get_space_conv_backprop_filter(op_desc: ConvBackpropDesc): + """get config space of convolution backwprop filter""" + if not isinstance(op_desc, ConvBackpropDesc): + raise TypeError('op_desc must be ConvBackpropDesc') + + stride_ = op_desc.stride + pad_ = op_desc.pad + dilation_ = op_desc.dilation + vc_util.convolution_format_check( + op_desc.fmap_shape, op_desc.filter_shape, pad_, stride_, dilation_) + config_space = ListConfigSpace(ConvBackpropFilterConfig) + + # if double buff is not enabled, set it's value to 1 + size_scale = 1 + block_size = 16 + + l1_max_size = (1024 * 1024) // size_scale + l0a_max_size = (64 * 1024) // size_scale + l0b_max_size = (64 * 1024) // size_scale + l0c_max_size = ((256 - 8) * 1024) // size_scale // 2 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, k_h, k_w = op_desc.filter_shape + k_n = cout + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + pad_top, pad_bottom, pad_left, pad_right = pad_ + s_h, s_w = stride_ + tile_co_start = 16 + tile_ci_start = 16 + data_len = 2 + h_max = in_h + pad_top + pad_bottom + win_h = (h_max - k_h) // s_h + 1 + h_max = (h_max - k_h) // s_h * s_h + k_h + w_max = in_w + pad_left + pad_right + win_w = (w_max - k_w) // s_w + 1 + w_max = (w_max - k_w) // s_w * s_w + k_w + + for tile_h in range(h_max, k_h - 1, -s_h): + size_h = tile_h + win_tile_h = (tile_h - k_h) // s_h + 1 + # Only one head for cut H axis + if win_tile_h * s_h < pad_top: + continue + # Only one tail for cut H axis + if (((win_h + win_tile_h - 1) // win_tile_h - 1) * win_tile_h - 1) * s_h + k_h > in_h + pad_top: + continue + if tile_h == h_max: + w_range = range(w_max, k_w - 1, -s_w) + size_h = in_h + else: + w_range = [w_max] + h_tiles = (win_h + win_tile_h - 1) // win_tile_h + if h_tiles == 2: + 
size_h = max(tile_h - pad_top, in_h + + pad_top - tile_h + k_h - s_h) + + for tile_w in w_range: + size_w = tile_w + win_tile_w = (tile_w - k_w) // s_w + 1 + # Only one head for cut W axis + if win_tile_w * s_w < pad_left: + continue + # Only one tail for cut W axis + if (((win_w + win_tile_w - 1) // win_tile_w - 1) * win_tile_w - 1) * s_w + k_w > in_w + pad_left: + continue + if size_w == w_max: + size_w = in_w + else: + w_tiles = (win_w + win_tile_w - 1) // win_tile_w + if w_tiles == 2: + size_w = max(tile_w - pad_left, in_w + + pad_left - tile_w + k_w - s_w) + for tile_kh in range(k_h, 0, -1): + for tile_kw in range(k_w, 0, -1): + k_n_ = ((k_n - 1) // 16 + 1) * 16 + co_range = range(k_n_, tile_co_start - 1, -16) + for tile_co in co_range: + in_c_ = ((in_c - 1) // 16 + 1) * 16 + ci_range = range(in_c_, tile_ci_start - 1, -16) + for tile_ci in ci_range: + tile_batch = 1 + l1_size = data_len * tile_batch * (tile_co * win_tile_h * win_tile_w + + tile_ci * size_h * size_w) + if l1_size > l1_max_size: + continue + + if (tile_batch != in_n or tile_co != k_n_ or tile_ci != in_c_): + tile_m = tile_co + tile_n = tile_ci * tile_kh * tile_kw + l0c_size = data_len * tile_n * tile_m + if l0c_size > l0c_max_size: + continue + k_max = tile_batch * tile_h * tile_w + k_max_ = ((k_max - 1) // 16 + 1) * 16 + k_size1 = l0a_max_size // data_len // tile_m + k_size1_ = k_size1 // 16 * 16 + k_size2 = l0b_max_size // data_len // tile_n + k_size2_ = k_size2 // 16 * 16 + for tile_k in range(min(k_max_, k_size1_, k_size2_), 15, -16): + config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, tile_co, + tile_batch, tile_h, tile_w, tile_m, + tile_k, tile_n)) + else: + for tile_n in range(tile_ci * tile_kh * tile_kw, 15, -16): + k_max = tile_batch * tile_h * tile_w + k_max_ = ((k_max - 1) // 16 + 1) * 16 + k_size = l0b_max_size // data_len // tile_n + k_size_ = k_size // 16 * 16 + for tile_k in range(min(k_max_, k_size_), 15, -16): + m_max = tile_co + m_max_ = ((m_max - 1) // 16 + 
1) * 16 + m_size1 = l0a_max_size // data_len // tile_k + m_size1_ = m_size1 // 16 * 16 + m_size2 = l0c_max_size // data_len // tile_n + m_size2_ = m_size2 // 16 * 16 + for tile_m in range(min(m_max_, m_size1_, m_size2_), 15, -16): + config_space.add(ConvBackpropFilterConfig(tile_ci, tile_kh, tile_kw, + tile_co, tile_batch, tile_h, + tile_w, tile_m, tile_k, tile_n)) + return None, config_space, op_desc.__str__(), None, None + + +def _get_space_matmul_cube(op_desc: MatmulCubeDesc): + """get config space of matmul_cube""" + if not isinstance(op_desc, MatmulCubeDesc): + raise TypeError('op_desc must be MatmulCubeDesc') + config_space = ListConfigSpace(MatmulCubeConfig) + batch_tuple, m, k, n = matmul_run.extract_dim( + op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y) + + mmax = (m + 15) // 16 + nmax = (n + 15) // 16 + kmax = (k + 15) // 16 + + double_buffer = True + mad_fp32 = True + + l1_max_size = (1024 * 1024) # L1 MEM 1024KB + l0a_max_size = (64 * 1024) # L0A MEM 64KB + l0b_max_size = (64 * 1024) # L0B MEM 64KB + l0c_max_size = (256 * 1024) # L0C MEM 256KB + # UB MEM 248KB, 8KB reserved for compiler + ub_max_size = ((256 - 8) * 1024) + + if double_buffer: + l1_max_size = l1_max_size // 2 + l0a_max_size = l0a_max_size // 2 + l0b_max_size = l0b_max_size // 2 + l0c_max_size = l0c_max_size // 2 + ub_max_size = ub_max_size // 2 + + if mad_fp32: + l0c_max_size = l0c_max_size // 2 + if op_desc.out_dtype == 'float32': + ub_max_size = ub_max_size // 2 + + bypass_options = [0, 1, 2] + + for bypass in bypass_options: + if (bypass == 2) and ((op_desc.adj_x == False and op_desc.left_format[0].lower() == 'n') or + (op_desc.adj_x == True and op_desc.left_format[0].lower() == 'z')): + continue + + if (bypass == 1) and ((op_desc.adj_y == False and op_desc.right_format[0].lower() == 'z') or + (op_desc.adj_y == True and op_desc.right_format[0].lower() == 'n')): + continue + + for k_l1 in range(1, kmax + 1): + if kmax % k_l1 != 0: + continue + for k_l0 in range(1, k_l1 
+ 1): + if k_l1 % k_l0 != 0: + continue + + # no need to cut from l1 to l0 for m and n when k is cut + for m_l1 in range(1, mmax + 1): + if mmax % m_l1 != 0: + continue + m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1) + for m_l0 in m_l0_range: + if m_l1 % m_l0 != 0: + continue + for n_l1 in range(1, nmax + 1): + if nmax % n_l1 != 0: + continue + n_l0_range = [n_l1] if k_l1 != kmax else range( + 1, n_l1 + 1) + for n_l0 in n_l0_range: + if n_l1 % n_l0 != 0: + continue + + if m_l0 * 16 * k_l0 * 16 > l0a_max_size: + continue + + if n_l0 * 16 * k_l0 * 16 > l0b_max_size: + continue + + if m_l0 * 16 * n_l0 * 16 > l0c_max_size: + continue + + if m_l0 * 16 * n_l0 * 16 > ub_max_size: + continue + + if bypass == 2: + l1_size = n_l1 * 16 * k_l1 * 16 + elif bypass == 1: + l1_size = m_l1 * 16 * k_l1 * 16 + else: + l1_size = (m_l1 * 16 + n_l1 * + 16) * k_l1 * 16 + if l1_size > l1_max_size: + continue + + if nmax == 1: + n_l1 = 0 + n_l0 = 0 + if mmax == 1: + m_l1 = 0 + m_l0 = 0 + if kmax == 1: + k_l1 = 16 + k_l0 = 16 + config_space.add(MatmulCubeConfig( + n_l1, n_l0, m_l1, m_l0, k_l1, k_l0, bypass)) + shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, + op_desc.bias, op_desc.left_format, + op_desc.right_format, op_desc.out_format) + return None, config_space, str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format, + op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype, + op_desc.out_dtype)), None, None + + + +def _get_space_batch_matmul_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of batch_matmul operator in gpu""" + return + +def get_range_block(space_res): + block_range = space_res.gpu_block_range_table.asnumpy().tolist() + block_mod = space_res.gpu_block_mod_table.asnumpy().tolist() + block_x_range = range(block_range[0][0], block_range[0][1]+1, block_mod[0][0]) + block_y_range = range(block_range[1][0], 
block_range[1][1]+1, block_mod[1][0]) + if len(block_y_range) == 0: block_y_range = range(1,2) + block_z_range = range(block_range[2][0], block_range[2][1]+1, block_mod[2][0]) + if len(block_z_range) == 0: block_z_range = range(1,2) + return block_x_range,block_y_range,block_z_range + +def get_range_thread(space_res): + thread_range = space_res.gpu_thread_range_table.asnumpy().tolist() + thread_mod = space_res.gpu_thread_mod_table.asnumpy().tolist() + thread_x_range = range(thread_range[0][0], thread_range[0][1]+1, thread_mod[0][0]) + thread_y_range = range(thread_range[1][0], thread_range[1][1]+1, thread_mod[1][0]) + if len(thread_y_range) == 0: thread_y_range = range(1,2) + thread_z_range = range(thread_range[2][0], thread_range[2][1]+1, thread_mod[2][0]) + if len(thread_z_range) == 0: thread_z_range = range(1,2) + return thread_x_range,thread_y_range,thread_z_range + +def get_space_with_block_thread(tiling_spaces, space_res, policy=GpuSpacePolicy.FULL): + total_shape = max([max(v) for v in tiling_spaces]) + new_spaces = [] + block_x_range, block_y_range, block_z_range = get_range_block(space_res) + thread_x_range, thread_y_range, thread_z_range = get_range_thread(space_res) + pbar = tqdm(total=len(tiling_spaces)) + max_thread = 1024 + for space in tiling_spaces: + pbar.set_description("Adding block, thread to spaces") + if policy == GpuSpacePolicy.REDUCE_ALL: + for bx in range((total_shape-1)//space[0]+1,(total_shape-1)//space[0]+2): + for by in block_y_range: + for bz in block_z_range: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + if tx * ty * tz > max_thread: + continue + tmp_space = space[:] + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + elif policy == GpuSpacePolicy.BMM: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + if tx * ty * tz > max_thread: + continue + 
tmp_space = space[:] + if tx > tmp_space[-1] or (len(tmp_space) >= 2 and ty > tmp_space[-2]) or (len(tmp_space) >= 3 and tz > tmp_space[-3]): + continue + bx = max(1, tmp_space[-1] // tx) + by = max(1, tmp_space[-2] // ty) if len(tmp_space) >= 2 else 1 + bz = max(1, tmp_space[-3] // tz) if len(tmp_space) >= 3 else 1 + if bx >= block_x_range.stop or by >= block_y_range.stop or bz >= block_z_range.stop: + continue + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + elif policy == GpuSpacePolicy.FULL: + for bx in block_x_range: + for by in block_y_range: + for bz in block_z_range: + for tx in thread_x_range: + for ty in thread_y_range: + for tz in thread_z_range: + tmp_space = space[:] + tmp_space.append(bx) + tmp_space.append(by) + tmp_space.append(bz) + tmp_space.append(tx) + tmp_space.append(ty) + tmp_space.append(tz) + new_spaces.append(tmp_space) + else: + raise ValueError("Policy {} is not defined.".format(policy)) + + pbar.update(1) + print("total spaces size is: ",len(new_spaces)) + return new_spaces + +def _get_space_conv_image2col_gemm_gpu(op_type: str, op_desc, tuning_attrs=[], tuning_attrs_info=None): + """get config space of conv_image2col_gemm operators in gpu""" + return + +_get_space_func = { + 'conv': _get_space_conv, + 'conv_bn1': _get_space_conv_bn1, + 'conv_backprop_input': _get_space_conv_backprop_input, + 'conv_backprop_filter': _get_space_conv_backprop_filter, + 'matmul': _get_space_matmul_cube, + "reduce_sum_gpu": _get_space_reduce_gpu_manually, + "batch_matmul_gpu": _get_space_batch_matmul_gpu, + "conv_image2col_gemm_gpu": _get_space_conv_image2col_gemm_gpu, +} + + +def get_space(op_type: str, op_desc: NamedTuple, tuning_attrs=[], tuning_attrs_info=None): + """get space of an operator""" + func = _get_space_func.get(op_type, None) + if func is None: + func = partial(_get_space_vector, op_type=op_type) + if "gpu" in op_type: + 
return func(op_type=op_type, op_desc=op_desc, tuning_attrs=tuning_attrs, tuning_attrs_info=tuning_attrs_info) + return func(op_desc=op_desc) diff --git a/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py b/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py new file mode 100644 index 00000000..f8ffed41 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/test_data_generators.py @@ -0,0 +1,147 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Generating test data for operators""" +from typing import NamedTuple + +import numpy as np +from gen_json_data import gen_json_data +from test_run import batchmatmul_run, conv_run, conv_backprop_input_run, conv_backprop_filter_run, matmul_run +from .type_definitions import ConvDesc, ConvBackpropDesc, MatmulCubeDesc + +def _gen_data_json(op_desc): + """Generating test data for composite json""" + input_for_mod, expect, _ = gen_json_data(op_desc) + return input_for_mod, expect + +def _gen_data_conv(op_desc: ConvDesc): + """Generating test data for conv""" + fmap_data, filter_data, bias_data, expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, + op_desc.pad, op_desc.stride, op_desc.dilation, + op_desc.use_bias) + out_data = np.full(expect.shape, 0, 'float16') + + if op_desc.use_bias: + args = (fmap_data, filter_data, bias_data, out_data) + else: + args = (fmap_data, filter_data, out_data) + return args, expect + + +def _gen_data_conv_bn1(op_desc: ConvDesc): + """Generating test data for conv_bn1""" + fmap_data, filter_data, bias_data, conv_expect = conv_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, + op_desc.pad, op_desc.stride, op_desc.dilation, + op_desc.use_bias) + axes = (0, 2, 3) + conv_mean = np.mean(conv_expect, axis=axes, keepdims=True) + conv_square = np.power(conv_expect, 2) + conv_var_part = np.mean(conv_square, axis=axes, keepdims=True) + + expects = (conv_expect, conv_var_part, conv_mean) + + out_datas = [np.full(e.shape, 0, 'float16') for e in expects] + out_datas[1] = out_datas[1].astype(np.float32) + out_datas[2] = out_datas[2].astype(np.float32) + + if op_desc.use_bias: + in_data = [fmap_data, filter_data, bias_data] + else: + in_data = [fmap_data, filter_data] + + args = in_data + for out in out_datas: + args.append(out) + args = tuple(args) + + return {"args": args, 'outputs': (-3, -2, -1)}, expects + + +def _gen_data_conv_backprop_input(op_desc: ConvBackpropDesc): + dout, w, dx = 
conv_backprop_input_run.gen_data(op_desc.fmap_shape, op_desc.filter_shape, op_desc.pad, + op_desc.stride, op_desc.dilation) + out_data = np.full(dx.shape, 0, 'float16') + + args = (dout, w, out_data) + return args, dx + + +def _gen_data_conv_backprop_filter(op_desc: ConvBackpropDesc): + """Generating test data for conv_backprop_filter""" + block_size = 16 + + in_n, in_c, in_h, in_w = op_desc.fmap_shape + cout, _, w_h, w_w = op_desc.filter_shape + + in_c = (in_c + block_size - 1) // block_size * block_size + cout = (cout + block_size - 1) // block_size * block_size + + x_shape = (in_n, in_c, in_h, in_w) + w_shape = (cout, in_c, w_h, w_w) + + dy_data, dx_data, expect = conv_backprop_filter_run.gen_data(x_shape, w_shape, op_desc.pad, op_desc.stride, + op_desc.dilation) + out_data = np.full(expect.shape, 0, 'float32') + + args = (dy_data, dx_data, out_data) + return args, expect + + +def _gen_data_matmul_cube(op_desc: MatmulCubeDesc): + """Generating test data for matmul_cube""" + batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape, op_desc.adj_x, op_desc.adj_y) + m = (m + 15) // 16 * 16 + n = (n + 15) // 16 * 16 + k = (k + 15) // 16 * 16 + _, _, _, out_shape, k = matmul_run.get_converted_shapes(m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, + op_desc.bias, op_desc.left_format, op_desc.right_format, + op_desc.out_format) + m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(batch_tuple, m, k, n, op_desc.dtype, op_desc.bias_dtype, + op_desc.out_dtype, op_desc.bias, op_desc.adj_x, + op_desc.adj_y, op_desc.left_format, + op_desc.right_format, op_desc.out_format) + + out_data = np.full(out_shape, np.nan, op_desc.out_dtype) + + if op_desc.bias: + args = (m_x, m_y, bias_data, out_data) + else: + args = (m_x, m_y, out_data) + return args, bench_mark + + +_gen_data_func = { + 'json': _gen_data_json, + 'conv': _gen_data_conv, + 'conv_bn1': _gen_data_conv_bn1, + 'conv_backprop_input': _gen_data_conv_backprop_input, + 'conv_backprop_filter': 
_gen_data_conv_backprop_filter, + 'matmul': _gen_data_matmul_cube, +} + + +def gen_data(op_type: str, op_desc: NamedTuple): + """Generate test data for operator + + Parameters + op_type: str + operator name + op_desc: NamedTuple + operator definition parameters + ---------- + """ + gen_func = _gen_data_func.get(op_type, None) + if gen_func is None: + raise ValueError('Unsupported op type for test data generating: %s' % op_type) + return gen_func(op_desc) diff --git a/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py b/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py new file mode 100644 index 00000000..f8af6cc0 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tiling_strategies_gpu.py @@ -0,0 +1,84 @@ +from akg.utils import custom_tiling as ct_util + +def reduce_gpu_tiling_strategy(in_shape, reduce_axis): + """Custom tiling strategy for reduce op in gpu""" + strategy = list() + + if reduce_axis == None or len(reduce_axis) == len(in_shape): + """all-reduce""" + strategy.append( + ct_util.create_constraint_on_axis( + values=32, constraints=ct_util.TileConstraint.MOD, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MOD + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1024, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[32, 1, 1], constraint=ct_util.TileConstraint.THREAD_MIN + ) + ) + elif (len(in_shape) - 1) in reduce_axis: + """Reduce-X: dummy strategy for hand-write space""" + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], 
constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX + ) + ) + + else: + """Reduce-Y: dummy strategy for hand-write space""" + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=0 + )[0] + ) + strategy.append( + ct_util.create_constraint_on_axis( + values=1, constraints=ct_util.TileConstraint.MAX, band=0, axis=1 + )[0] + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.THREAD_MAX + ) + ) + strategy.append( + ct_util.modify_common_constraints( + value=[1, 1, 1], constraint=ct_util.TileConstraint.BLOCK_MAX + ) + ) + + return strategy + + +def conv_dummy_strategy(): + """Conv strategy: dummy strategy""" + return + +def batch_matmul_gpu_tiling_strategy(desc): + """Custom tiling strategy for batch matmul in gpu with or without tensor core""" + return \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuner.py b/tests/fuzz/tune_for_gpu/autotuning/tuner.py new file mode 100644 index 00000000..98e35c25 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuner.py @@ -0,0 +1,359 @@ +# Copyright 2019 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tuner for finding best config for operators""" +import logging +import time +import json +import os +import numpy as np +from multiprocessing import Process +from tvm.autotvm.tuner.xgboost_cost_model import XgbCostModel +from tvm.autotvm.tuner.sa_model_optimizer import SimulatedAnnealingOptimizer +from .space import ConfigSpace +from .runner import KernelRunner +from tqdm import tqdm + +logger = logging.getLogger('fuzz.tune.autotuning.tuner') + + +class Tuner: + """Basic tuner class + + Parameters + ---------- + runner: KernelRunner + This is for run kernels in physical device + config_space: ConfigSpace + The space of configs + n_parallel: int + How many kernels are processed in a turn + """ + + def __init__(self, runner: KernelRunner, index_table: list, config_space: ConfigSpace, n_parallel: int = 1, skip_config_set=None): + self._runner = runner + self._index_table = index_table + self._space = config_space + self._n_parallel = n_parallel + + # trial plan + self._trials = [] + self._trial_pt = 0 + self._visited = set() + + # observed samples + self._xs = [] + self._ys = [] + + # keep the current best + self._best_config = None # type: ConfigEntity + self._best_time = np.inf + self._best_iter = 0 + self._tuning_time = 0.0 + self._original_time = np.inf + self._skip_config_set = skip_config_set + + @property + def best_config(self): + return self._best_config + + @property + def best_time(self): + return self._best_time + + @property + def best_iter(self): + return self._best_iter + + @property + def tuning_time(self): + return self._tuning_time + + @property + def original_time(self): + return self._original_time + + @property + def xs(self): + return self._xs + + @property + def ys(self): + return self._ys + + def info(self): + print('space size:', self._space.length) + print('best config:', self._best_config) + print('best time:', self._best_time) + print('best_iter:', self._best_iter) + print('tuning time:', self._tuning_time, 'secs') + + def 
next_batch(self, batch_size: int, is_add_visited=True): + """extract next batch with xgboost model""" + ret = [] + counter = 0 + if not is_add_visited: + return [self._space.get(index) for index in range(min(batch_size, self._space.length))] + while counter < batch_size and self._space.has_next(): + index = 0 + while self._trial_pt < len(self._trials): + index = self._trials[self._trial_pt] + if index not in self._visited: + break + self._trial_pt += 1 + + if self._trial_pt >= len(self._trials): + # if the trial list is empty choose randomly + index = self._space.fetch_index() + + ret.append(self._space.get(index)) + self._visited.add(index) + + counter += 1 + return ret + + def next_config(self, batch_size: int): + """extract next config orderly""" + ret = [] + counter = 0 + while counter < batch_size and self._space.has_next(): + index = self._space.fetch_next_index() + ret.append(self._space.get(index)) + self._visited.add(index) + counter += 1 + return ret + + def export_configs(self, configs: list, output_file: str, append: bool = True, desc=""): + """export configs""" + mode = "a" if append else "w" + with open(output_file, mode) as f: + for x, y in configs: + if y != -1: + f.write("{} | {} | {}\n".format(desc, json.dumps(x._asdict()), y)) + + def export_dim_configs(self, configs, output_file: str, append: bool = True, key=""): + """export dim configs""" + mode = "a" if append else "w" + data = {} + try: + if os.path.isfile(output_file): + with open(output_file, 'r') as f: + data = json.load(f) + except IOError as e: + logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) + with open(output_file, mode) as f: + import re + data[key] = configs + s = json.dumps(data, sort_keys=True) + s = re.sub(r',\s*"', ',\n"', s) + s = '{\n' + s[1:-1] + '\n}' + f.write(s) + + def export_dim_configs_for_keys(self, configs, output_file: str, append: bool = True, keys=[]): + """export dim configs""" + mode = "a" if append else "w" + data = {} + try: + if 
os.path.isfile(output_file): + with open(output_file, 'r') as f: + data = json.load(f) + except IOError as e: + logger.debug("get dim info from [%s] failed: %s", output_file, str(e)) + with open(output_file, mode) as f: + import copy + tmp = copy.deepcopy(configs) + for key in reversed(keys): + info = {key: tmp} + tmp = copy.deepcopy(info) + data.update(info) + s = json.dumps(data, sort_keys=True, indent=4) + print(s) + f.write(s) + + def load_configs(self, input_file: str): + """load configs""" + configs = [] + file_path = os.path.realpath(input_file) + if os.path.isfile(file_path): + with open(file_path, "r") as f: + for line in f: + x, y, _ = line.split('|') + configs.append((self._space.input_type(**json.loads(x)), np.float64(y))) + return configs + + def tune(self, least_try_times: int, output_file: str = None): + """grid search all configs""" + i = 0 + pbar = tqdm(total=least_try_times) + while i < least_try_times: + if not self._space.has_next(): + break + configs = self.next_config(min(self._n_parallel, least_try_times - i)) + run_times = self._runner.run(configs, self._best_time) + results = [] + for idx, conf in enumerate(configs): + results.append((conf.input_id, run_times[idx])) + # keep best config + if self.best_time > run_times[idx]: + self._best_time = run_times[idx] + self._best_iter = i + idx + self._best_config = conf + + i += len(results) + pbar.update(len(results)) + + # update + for res in results: + self._xs.append(res[0]) + self._ys.append(res[1]) + if output_file: + configs = [(self._space.get(res[0]).input, res[1]) for res in results] + self.export_configs(configs, output_file) + return run_times + + +class ModelBasedTuner(Tuner): + """Model based tuner + This tuner will fit a cost model and use an optimizer to find the maximums of the cost model as next trials + + Parameters + ---------- + plan_size: int + Tuner will re-fit model per `plan_size` new measure samples + pre_model: CostModel + The cost model that predicts the speed of a 
config (IR) + """ + + def __init__(self, runner, index_table, config_space, n_parallel=1, plan_size=32, pre_model=None): + super(ModelBasedTuner, self).__init__(runner, index_table, config_space, n_parallel) + self.__plan_size = plan_size + + if pre_model is not None: + self.__cost_model = pre_model + self.__cost_model.reset_space(self._space) + else: + self.__cost_model = XgbCostModel(self._space) + + self.__model_optimizer = SimulatedAnnealingOptimizer(self._space) + self.__train_ct = 0 + + self.__is_auto_set_dim = False#True + + # time to leave + self.__ttl = None + self.__least_try_times = None + self.__early_stopping = None + + self.__model_run_time = 0.0 + + def info(self): + super(ModelBasedTuner, self).info() + print('model run time:', self.__model_run_time, 'secs') + + def model_res(self): + self.__cost_model.fit(self._xs, self._ys, self.__plan_size) + best_configs = self.__model_optimizer.find_best( + self.__cost_model, self.__plan_size, self._visited) + self._trials = best_configs + + def tune(self, least_try_times: int, output_file: str = None): + early_stopping = least_try_times + self.__least_try_times = least_try_times + self.__early_stopping = early_stopping + + logger.setLevel(logging.DEBUG) + old_level = logger.level + i = 0 + error_ct = 0 + + tuning_start = time.time() + while (i < self._space.length and (i < least_try_times + or (self._best_time > self._original_time - 0.9 + and i < least_try_times * 3))): + if not self._space.has_next(): + break + iter_start = time.time() + if not self.__is_auto_set_dim: + configs = self.next_batch(min(self._n_parallel, self._space.length - i)) + else: + configs = self.next_batch(min(self._n_parallel, self._space.length - i), False) + + logger.debug('--indexes: %s', str([x.input_id for x in configs])) + + run_times = self._runner.run(configs, self._best_time, self.__is_auto_set_dim) + if self.__is_auto_set_dim: + from operator import add + from functools import reduce + self._original_time = reduce(add, 
run_times) / len(run_times) + self._best_time = self._original_time + self._best_iter = -1 + self._best_config = None + run_times = None + self.__is_auto_set_dim = False + continue + + results = [] + for idx, conf in enumerate(configs): + if run_times[idx] == -1: + continue + results.append((conf.input_id, run_times[idx])) + # keep best config + if self._best_time > run_times[idx]: + self._best_time = run_times[idx] + self._best_iter = i + idx + self._best_config = conf + + i += len(results) + self.__ttl = min(early_stopping + self.best_iter, self._space.length) - i + + start = time.time() + # update + for res in results: + self._xs.append(res[0]) + self._ys.append(res[1]) + if output_file: + configs = [(self._space.get(res[0]).input, res[1]) for res in results] + desc = str(self._runner.op_desc) + self.export_configs(configs, output_file, desc=desc) + # if we have enough new training samples + if len(self._xs) >= self.__plan_size * (self.__train_ct + 1): + p = Process(target=self.model_res) + p.start() + p.join() + self._trial_pt = 0 + self.__train_ct += 1 + + end = time.time() + logger.debug('model running time: %f seconds', end - start) + self.__model_run_time += end - start + + iter_end = time.time() + logger.debug('iter time: %f seconds', iter_end - iter_start) + + if self._best_iter > 0 and i >= self.best_iter + early_stopping: + logger.debug('Early stopped. Best iter: %d', self._best_iter) + return + + print("tuning time already, ", time.time() - tuning_start) + if time.time() - tuning_start > 7200: + logger.debug('Early stopped because of too long time. Best iter: %d', self._best_iter) + return + + if error_ct > 150: + logging.warning('Too many errors happen in the tuning. 
Now is in debug mode') + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(old_level) + + self._tuning_time += time.time() - tuning_start diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json b/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json new file mode 100644 index 00000000..2896a2d5 --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuning_attrs_descs/reduce_tuning_attrs_desc.json @@ -0,0 +1,9 @@ +{ + "enable_atomic_add": { + "dtype": "bool", + "options": [ + "False", + "True" + ] + } +} \ No newline at end of file diff --git a/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py b/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py new file mode 100644 index 00000000..394cda1d --- /dev/null +++ b/tests/fuzz/tune_for_gpu/autotuning/tuning_utils.py @@ -0,0 +1,155 @@ +from collections import namedtuple +import os +import logging + + +def get_block_str_from_config(config: namedtuple): + block_param = "" + if "block_x" in getattr(config, "_fields"): + block_param += str(config.block_x) + " " + + if "block_y" in getattr(config, "_fields"): + block_param += str(config.block_y) + " " + + if "block_z" in getattr(config, "_fields"): + block_param += str(config.block_z) + " " + return block_param + + +def get_thread_str_from_config(config: namedtuple): + thread_param = "" + if "thread_x" in getattr(config, "_fields"): + thread_param += str(config.thread_x) + " " + + if "thread_y" in getattr(config, "_fields"): + thread_param += str(config.thread_y) + " " + + if "thread_z" in getattr(config, "_fields"): + thread_param += str(config.thread_z) + " " + return thread_param + + +def get_parallel_build_num(): + """get the num of parallel build""" + env_dic = os.environ + try: + return int(env_dic.get('BUILD_PARALLEL_NUM').lower()) if env_dic.get('BUILD_PARALLEL_NUM') else 1 + except NameError as e: + logging.error(e) + return 1 + + +def get_available_gpu_num(): + """get the num of gpu""" + 
env_dic = os.environ + try: + return [int(id) for id in env_dic.get('USE_GPU_DEVICES').split(",")] if env_dic.get('USE_GPU_DEVICES') else [0, ] + except NameError as e: + logging.error(e) + return 1 + +def get_real_attr(value ,key ,need_tune_json, need_tune_keys): + if key not in need_tune_keys: + return value + if need_tune_json[key]['dtype'] == "bool": + if need_tune_json[key]['options'][value].lower() == "true": + return True + elif need_tune_json[key]['options'][value].lower() == "false": + return False + else: + raise TypeError("Wrong boolean type, please check json file") + elif need_tune_json[key]['dtype'] == "str": + if isinstance(need_tune_json[key]['options'][value], str): + return need_tune_json[key]['options'][value] + else: + raise TypeError("Wrong str type, please check json file") + elif need_tune_json[key]['dtype'] == "int": + if isinstance(need_tune_json[key]['options'][value], int): + return need_tune_json[key]['options'][value] + else: + raise TypeError("Wrong int type, please check json file") + + +def merge_attrs(attrs, config, need_tune_json): + tiling = [getattr(config, name) for name in getattr( + config, '_fields') if name.startswith('tiling')] + dim_str = '' + d_config = config._asdict() + d_attrs = attrs._asdict() + + is_2d_tiling = False + for name in getattr(config, '_fields'): + if name.startswith('tiling'): + if name.count("_") == 2: + is_2d_tiling = True + break + + for i, element in enumerate(tiling): + if is_2d_tiling: + if i % 2 == 0: + dim_str += "0 " + str(i//2) + " " + dim_str += str(element) + " " + else: + # 1d tiling + dim_str += "0 " + str(i) + " " + str(element) + " 1 " + + # add block, thread information + block = [str(getattr(config, name)) for name in getattr( + config, '_fields') if name.startswith('block')] + bind_block_str = ' '.join(block) + + thread = [str(getattr(config, name)) for name in getattr( + config, '_fields') if name.startswith('thread')] + bind_thread_str = ' '.join(thread) + + d_attrs['dim'] = dim_str 
+    d_attrs['bind_block'] = bind_block_str
+    d_attrs['bind_thread'] = bind_thread_str
+
+    need_tune_keys = need_tune_json.keys()
+    for key in need_tune_keys:
+        d_attrs[key] = d_config[key]
+
+    # make a new attrs with config info
+    attrs_type = type(attrs)
+    config_list = [get_real_attr(d_attrs[k],k,need_tune_json, need_tune_keys) for k in d_attrs]
+    new_attrs = attrs_type(*config_list)
+    return new_attrs
+
+
+def get_skip_configs_from_log(skip_configs_log):
+    skip_config_set = set()
+    if skip_configs_log != "":
+        with open(skip_configs_log, 'r') as file:
+            for line in file:
+                config = str(line.split("|")[1]).strip()
+                skip_config_set.add(config)
+        print("SKIP CONFIGS NUMBER:", len(skip_config_set))
+    return skip_config_set
+
+def get_tuning_attrs_from_json(tuning_attrs_json):
+    import json
+    need_tune_spaces = [[]]
+    keys = []
+    json_string = dict()
+    if tuning_attrs_json != "":
+        with open(tuning_attrs_json,'r') as file:
+            json_string =json.load(file)
+        for key in json_string.keys():
+            keys.append(key)
+            num_options = len(json_string[key]['options'])
+            tmp_spaces = []
+            for space in need_tune_spaces:
+                for i in range(num_options):
+                    tmp_space = space[:]
+                    tmp_space.append(i)
+                    tmp_spaces.append(tmp_space)
+            need_tune_spaces = tmp_spaces[:]
+    return (keys, need_tune_spaces, json_string)
+
+if __name__ == "__main__":
+    """test components"""
+    file_name = "tuning_attrs_descs/reduce_tuning_attrs_desc.json"
+    keys, need_tune_spaces, json_string = get_tuning_attrs_from_json(file_name)
+    print(keys)
+    print(need_tune_spaces)
\ No newline at end of file
diff --git a/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py b/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
new file mode 100644
index 00000000..f792c73e
--- /dev/null
+++ b/tests/fuzz/tune_for_gpu/autotuning/type_definitions.py
@@ -0,0 +1,49 @@
+# Copyright 2019 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""operator description and config param definitions""" +from collections import namedtuple + +# op desc for ascend +ConvDesc = namedtuple("ConvDesc", [ + 'fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation', 'use_bias']) + +ConvBackpropDesc = namedtuple( + "ConvBackpropDesc", ['fmap_shape', 'filter_shape', 'pad', 'stride', 'dilation']) + +MatmulCubeDesc = namedtuple("MatmulCubeDesc", ["x_shape", "y_shape", "bias", "left_format", "right_format", + "out_format", "adj_x", "adj_y", "dtype", "bias_dtype", "out_dtype"]) + + +# op desc for gpu +ReduceGpuDesc = namedtuple("ReduceGpuDesc", [ + "in_shape", "in_dtype", "axis", "keepdims", + "poly_sch", "dim", "bind_block", "bind_thread", + "enable_akg_reduce_lib", "enable_atomic_add"]) + + +# config param definitions for ascend +ConvConfig = namedtuple('ConvConfig', [ + 'tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w', 'bypass']) +ConvBackpropInputConfig = namedtuple('ConvBackpropInputConfig', + ['tile_h', 'tile_co', 'tile_m', 'tile_k', 'tile_n', 'tile_w']) +ConvBackpropFilterConfig = namedtuple('ConvBackpropFilterConfig', + ['tile_ci', 'tile_kh', 'tile_kw', 'tile_co', 'tile_batch', + 'tile_h', 'tile_w', 'tile_m', 'tile_k', 'tile_n']) +MatmulCubeConfig = namedtuple( + 'MatmulCubeConfig', ['n_l1', 'n_l0', 'm_l1', 'm_l0', 'k_l1', 'k_l0', 'bypass']) + +# config param definitions for gpu + +EmptyConfig = namedtuple('empty', []) diff --git a/tests/fuzz/tune_for_gpu/config_gpu.sh b/tests/fuzz/tune_for_gpu/config_gpu.sh new file mode 100644 index 00000000..f6e082ee --- /dev/null +++ 
b/tests/fuzz/tune_for_gpu/config_gpu.sh @@ -0,0 +1,16 @@ +# how many multi-processing to build +export BUILD_PARALLEL_NUM=4 + +# set the default gpu devices, plz never change it +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# set the real devices you want to use +export USE_GPU_DEVICES=0,1,2,3 + +export RUNTIME_MODE=gpu + +export PROFILING_MODE=true + +# ascend config +export DEVICE_ID=0 +export DEVICE_TOTAL_NUM=8 diff --git a/tests/fuzz/tune_for_gpu/test_gpu.py b/tests/fuzz/tune_for_gpu/test_gpu.py new file mode 100644 index 00000000..a06064ce --- /dev/null +++ b/tests/fuzz/tune_for_gpu/test_gpu.py @@ -0,0 +1,67 @@ +# Copyright 2019-2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""test"""
+import time
+from autotuning.job import launch
+from akg.utils import kernel_exec
+from akg.ops.math_gpu import reduce_sum
+from autotuning.type_definitions import ReduceGpuDesc
+import numpy as np
+import sys
+import argparse
+from autotuning.tuning_utils import get_skip_configs_from_log, get_tuning_attrs_from_json
+
+
+def reduce_sum_gpu_execute(in_shape, dtype, axis=None, keepdims=False, attrs=False):
+    mod = kernel_exec.op_build_test(reduce_sum, (in_shape, ), (dtype, ),
+                                    kernel_name="reduce_sum_gpu", op_attrs=[axis, keepdims],
+                                    attrs={"target": "cuda", "enable_akg_reduce_lib": True})
+    return mod
+
+def run_test_reduce_sum(in_shape, in_dtype, axis=None, keepdims=False, skip_config_set=None, tuning_attrs_info=None):
+    time_start = time.time()
+    op_type_ = 'reduce_sum_gpu'
+    debug_mode_ = True
+    save_res_ = True
+    all_space_ = True
+    op_config = [in_shape, in_dtype, axis, keepdims,
+                 "", "", "",
+                 True, True, True]
+    op_config = ReduceGpuDesc(*op_config)
+    desc_ = ('reduce_sum_gpu', reduce_sum_gpu_execute,
+             op_config, tuning_attrs_info)
+    launch(op_type=op_type_, debug_mode=debug_mode_,
+           save_res=save_res_, desc=desc_, all_space=all_space_,
+           from_json=False, skip_config_set=skip_config_set,
+           tuning_attrs_info=tuning_attrs_info)
+    time_end = time.time()
+    print("total tuning time: ", time_end - time_start)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--skip_configs_log", type=str,
+                        default="", help="skip those configs in .log file")
+    parser.add_argument("--tuning_attrs_json", type=str, default="",
+                        help="the json file to describe the tuning attrs")
+    args = parser.parse_args()
+
+    # check whether have configs need to skip
+    skip_config_set = get_skip_configs_from_log(args.skip_configs_log)
+
+    # add tuning_attrs from json file
+    tuning_attrs_info = get_tuning_attrs_from_json(args.tuning_attrs_json)
+
+    run_test_reduce_sum((1024, 1024), "float32", (1,),
+                        False, skip_config_set=skip_config_set,
tuning_attrs_info=tuning_attrs_info) diff --git a/tests/test_env.sh b/tests/test_env.sh index 2ca16cd7..ad80c58d 100644 --- a/tests/test_env.sh +++ b/tests/test_env.sh @@ -25,7 +25,7 @@ else TVM_ROOT="${AKG_DIR}/third_party/incubator-tvm" export LD_LIBRARY_PATH=${AKG_BUILD_DIR}:${LD_LIBRARY_PATH} - export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/python:${PYTHONPATH} + export PYTHONPATH=${TVM_ROOT}/python:${TVM_ROOT}/topi:${TVM_ROOT}/topi/python:${AKG_DIR}:${AKG_DIR}/tests/common:${AKG_DIR}/python:${AKG_DIR}/tests/operators/gpu:${AKG_DIR}/tests/fuzz/tune_for_gpu:${PYTHONPATH} if [ $# -eq 1 ] && [ $1 = "gpu" ]; then export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} fi -- Gitee