mxnet-commits mailing list archives

From: GitBox <...@apache.org>
Subject: [GitHub] marcoabreu closed pull request #9661: Revert "Refactor operators & MKLDNN (#8302)"
marcoabreu closed pull request #9661: Revert "Refactor operators & MKLDNN (#8302)"
URL: https://github.com/apache/incubator-mxnet/pull/9661
 
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/.gitmodules b/.gitmodules
index 42f0027505..170c105a6f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -22,7 +22,3 @@
 [submodule "3rdparty/googletest"]
 	path = 3rdparty/googletest
 	url = https://github.com/google/googletest.git
-[submodule "3rdparty/mkldnn"]
-	path = 3rdparty/mkldnn
-	url = https://github.com/intel/mkl-dnn.git
-	branch = master
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
deleted file mode 160000
index 3e1f8f53f6..0000000000
--- a/3rdparty/mkldnn
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 3e1f8f53f6845dce23abf8089501c2eb45420b9e
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dfa9834ffb..14b40e4f7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,8 +33,8 @@ mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN            "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
 mxnet_option(USE_LAPACK           "Build with lapack support" ON IF NOT MSVC)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
-mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
-mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
+mxnet_option(USE_MKLML_MKL        "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
+mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON AND NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
@@ -138,11 +138,14 @@ if(USE_VTUNE)
 endif()
 
 if(USE_MKL_IF_AVAILABLE)
+  if(USE_MKL_EXPERIMENTAL AND NOT USE_MKLML_MKL)
+    message(ERROR " USE_MKL_EXPERIMENTAL can only be used when USE_MKL_EXPERIMENTAL is enabled")
+  endif()
   find_package(MKL)
   if(MKL_FOUND)
     include_directories(${MKL_INCLUDE_DIR})
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
-	add_definitions(-DMXNET_USE_MKLDNN=1)
+    add_definitions(-DMXNET_USE_MKL2017=1)
     add_definitions(-DUSE_MKL=1)
     add_definitions(-DCUB_MKL=1)
     list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
@@ -151,6 +154,11 @@ if(USE_MKL_IF_AVAILABLE)
     endif()
     # If using MKL, use the Intel OMP libraries
     list(APPEND mxnet_LINKER_LIBS iomp5)
+    if(USE_MKL_EXPERIMENTAL)
+      add_definitions(-DMKL_EXPERIMENTAL=1)
+    else()
+      add_definitions(-DMKL_EXPERIMENTAL=0)
+    endif()
   else()
     message(STATUS " MKL not found")
   endif()
diff --git a/Jenkinsfile b/Jenkinsfile
index 80f9424d68..05cda74066 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -24,7 +24,6 @@
 mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/dmlc-core/libdmlc.a'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
 // timeout in minutes
@@ -162,18 +161,18 @@ def python3_gpu_ut(docker_type) {
 }
 
 // Python 2
-def python2_mkldnn_ut(docker_type) {
+def python2_mklml_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-2.7 --with-timer --verbose tests/python/cpu"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/cpu"
   }
 }
 
 // Python 3
-def python3_mkldnn_ut(docker_type) {
+def python3_mklml_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-3.4 --with-timer --verbose tests/python/cpu"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/cpu"
   }
 }
 
@@ -244,20 +243,21 @@ try {
         }
       }
     },
-    'CPU: MKLDNN': {
+    'CPU: MKLML': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mkldnn-cpu') {
+        ws('workspace/build-mklml-cpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKLDNN=1                  \
+            USE_MKL2017=1                 \
+            USE_MKL2017_EXPERIMENTAL=1    \
             -j\$(nproc)
             """
           make("cpu_mklml", flag)
-          pack_lib('mkldnn_cpu', mx_mkldnn_lib)
+          pack_lib('mklml_cpu')
         }
       }
     },
@@ -278,23 +278,24 @@ try {
         }
       }
     },
-    'GPU: MKLDNN': {
+    'GPU: MKLML': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mkldnn-gpu') {
+        ws('workspace/build-mklml-gpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKLDNN=1                  \
+            USE_MKL2017=1                 \
+            USE_MKL2017_EXPERIMENTAL=1    \
             USE_CUDA=1                    \
             USE_CUDA_PATH=/usr/local/cuda \
             USE_CUDNN=1                   \
             -j\$(nproc)
             """
           make("build_cuda", flag)
-          pack_lib('mkldnn_gpu', mx_mkldnn_lib)
+          pack_lib('mklml_gpu')
         }
       }
     },
@@ -441,43 +442,43 @@ try {
         }
       }
     },
-    'Python2: MKLDNN-CPU': {
+    'Python2: MKLML-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python2-mkldnn-cpu') {
+        ws('workspace/ut-python2-mklml-cpu') {
           init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+          unpack_lib('mklml_cpu')
           python2_ut('cpu_mklml')
-          python2_mkldnn_ut('cpu_mklml')
+          python2_mklml_ut('cpu_mklml')
         }
       }
     },
-    'Python2: MKLDNN-GPU': {
+    'Python2: MKLML-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python2-mkldnn-gpu') {
+        ws('workspace/ut-python2-mklml-gpu') {
           init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+          unpack_lib('mklml_gpu')
           python2_gpu_ut('gpu_mklml')
-          python2_mkldnn_ut('gpu_mklml')
+          python2_mklml_ut('gpu_mklml')
         }
       }
     },
-    'Python3: MKLDNN-CPU': {
+    'Python3: MKLML-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python3-mkldnn-cpu') {
+        ws('workspace/ut-python3-mklml-cpu') {
           init_git()
-          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+          unpack_lib('mklml_cpu')
           python3_ut('cpu_mklml')
-          python3_mkldnn_ut('cpu_mklml')
+          python3_mklml_ut('cpu_mklml')
         }
       }
     },
-    'Python3: MKLDNN-GPU': {
+    'Python3: MKLML-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python3-mkldnn-gpu') {
+        ws('workspace/ut-python3-mklml-gpu') {
           init_git()
-          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+          unpack_lib('mklml_gpu')
           python3_gpu_ut('gpu_mklml')
-          python3_mkldnn_ut('gpu_mklml')
+          python3_mklml_ut('gpu_mklml')
         }
       }
     },
diff --git a/Makefile b/Makefile
index d325aa65ab..976035b108 100644
--- a/Makefile
+++ b/Makefile
@@ -59,11 +59,11 @@ endif
 # use customized config file
 include $(config)
 
-ifeq ($(USE_MKLDNN), 1)
-	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
-	MKLDNNROOT := $(firstword $(RETURN_STRING))
-	MKLROOT := $(lastword $(RETURN_STRING))
-	export USE_MKLML = 1
+ifeq ($(USE_MKL2017), 1)
+# must run ./prepare_mkl before including mshadow.mk
+	RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT))
+	MKLROOT := $(firstword $(RETURN_STRING))
+	export USE_MKLML = $(lastword $(RETURN_STRING))
 endif
 
 include mshadow/make/mshadow.mk
@@ -131,16 +131,23 @@ ifeq ($(USE_NNPACK), 1)
 	LDFLAGS += -lnnpack
 endif
 
-ifeq ($(USE_MKLDNN), 1)
-	CFLAGS += -DMXNET_USE_MKLDNN=1
+ifeq ($(USE_MKL2017), 1)
+	CFLAGS += -DMXNET_USE_MKL2017=1
 	CFLAGS += -DUSE_MKL=1
-	CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
-	ifneq ($(MKLDNNROOT), $(MKLROOT))
-		CFLAGS += -I$(MKLROOT)/include
-		LDFLAGS += -L$(MKLROOT)/lib
+	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
+	CFLAGS += -I$(MKLML_ROOT)/include
+	LDFLAGS += -L$(MKLML_ROOT)/lib
+	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
+		CFLAGS += -DMKL_EXPERIMENTAL=1
+	else
+		CFLAGS += -DMKL_EXPERIMENTAL=0
+	endif
+	ifeq ($(UNAME_S), Darwin)
+		LDFLAGS += -lmklml
+	else
+		LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu
 	endif
-	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	LDFLAGS +=  -liomp5
 endif
 
 ifeq ($(USE_OPERATOR_TUNING), 1)
@@ -154,7 +161,7 @@ endif
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
-ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
+ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib64/liblapack.a))
@@ -172,7 +179,7 @@ ifeq ($(USE_LAPACK), 1)
 	ifneq ($(USE_LAPACK_PATH), )
 		LDFLAGS += -L$(USE_LAPACK_PATH)
 	endif
-	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
+	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
 		LDFLAGS += -llapack
 	endif
 	CFLAGS += -DMXNET_USE_LAPACK
@@ -562,8 +569,7 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 else
 clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
-		external/mkldnn/install/*
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc
index cfee605595..f35591d82b 100644
--- a/amalgamation/mxnet_predict0.cc
+++ b/amalgamation/mxnet_predict0.cc
@@ -66,7 +66,7 @@
 #include "src/operator/operator_util.cc"
 #include "src/operator/nn/activation.cc"
 #include "src/operator/nn/batch_norm.cc"
-#include "src/operator/nn/concat.cc"
+#include "src/operator/concat.cc"
 #include "src/operator/nn/convolution.cc"
 #include "src/operator/nn/deconvolution.cc"
 #include "src/operator/nn/dropout.cc"
diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake
index 13d7083f3d..3a8723a5dd 100644
--- a/cmake/ChooseBlas.cmake
+++ b/cmake/ChooseBlas.cmake
@@ -23,7 +23,7 @@ if(USE_MKL_IF_AVAILABLE)
     find_package(MKL)
   endif()
   if(MKL_FOUND)
-	if(USE_MKLDNN)
+    if(USE_MKLML_MKL)
       set(BLAS "open")
     else()
       set(BLAS "MKL")
@@ -55,4 +55,4 @@ elseif(BLAS STREQUAL "apple")
   list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES})
   add_definitions(-DMSHADOW_USE_MKL=0)
   add_definitions(-DMSHADOW_USE_CBLAS=1)
-endif()
+endif()
\ No newline at end of file
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 70405566d8..743a871ee7 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -19,7 +19,7 @@
 #
 # Options:
 #
-#   USE_MKLDNN                    : Search for MKL:ML library variant
+#   USE_MKLML_MKL                   : Search for MKL:ML library variant
 #
 #   MKL_USE_SINGLE_DYNAMIC_LIBRARY  : use single dynamic library interface
 #   MKL_USE_STATIC_LIBS             : use static libraries
@@ -33,7 +33,7 @@
 #   MKL_INCLUDE_DIR      : unclude directory
 #   MKL_LIBRARIES        : the libraries to link against.
 #
-# cjolivier01: Changed to also look for MKLDNN library (subset of mkl) instead of standard MKL package
+# cjolivier01: Changed to also look for MKLML library (subset of mkl) instead of standard MKL package
 #
 
 if(MKL_FOUND)
@@ -43,7 +43,7 @@ endif()
 # ---[ Root folders
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 
-if(USE_MKLDNN)
+if(USE_MKLML_MKL)
 
   find_path(MKL_ROOT include/mkl_blas.h
     PATHS $ENV{MKL_ROOT}
@@ -66,14 +66,13 @@ if(USE_MKLDNN)
   set(__mkl_libs "")
 
   if(WIN32)
-    list(APPEND __mkl_libs mklml_intel)
+    list(APPEND __mkl_libs intel)
   else()
-    list(APPEND __mkl_libs mklml_gnu)
+    list(APPEND __mkl_libs gnu)
   endif()
-  list(APPEND __mkl_libs mkldnn)
 
   foreach (__lib ${__mkl_libs})
-    set(__mkl_lib "${__lib}")
+    set(__mkl_lib "mklml_${__lib}")
     string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
 
     if(MKL_USE_STATIC_LIBS)
@@ -91,7 +90,8 @@ if(USE_MKLDNN)
     list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
   endforeach()
 
-else(USE_MKLDNN)
+
+else(USE_MKLML_MKL)
 
   # ---[ Options
   mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON)
@@ -193,7 +193,7 @@ else(USE_MKLDNN)
     list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY})
   endif()
 
-endif(USE_MKLDNN)
+endif(USE_MKLML_MKL)
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for})
diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py
index 05f5ddc450..dc8915cda4 100755
--- a/example/image-classification/common/data.py
+++ b/example/image-classification/common/data.py
@@ -112,8 +112,7 @@ def get_rec_iter(args, kv=None):
     image_shape = tuple([int(l) for l in args.image_shape.split(',')])
     if 'benchmark' in args and args.benchmark:
         data_shape = (args.batch_size,) + image_shape
-        train = SyntheticDataIter(args.num_classes, data_shape,
-                args.num_examples / args.batch_size, np.float32)
+        train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32)
         return (train, None)
     if kv:
         (rank, nworker) = (kv.rank, kv.num_workers)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 23c24766a0..47582fa595 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -34,14 +34,14 @@
 #include <vector>
 #include <map>
 #include <string>
-#include <memory>
 #include <algorithm>
-#if MXNET_USE_MKLDNN == 1
-#include <mkldnn.hpp>
-#endif
+#include <memory>
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
+#if MKL_EXPERIMENTAL == 1
+#include <mkl_memory.h>
+#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 was required for ndarray module"
@@ -73,7 +73,6 @@ enum NDArrayFormatErr {
   kRSPIdxErr,     // indices error for row sparse
 };
 
-class MKLDNNMemory;
 
 /*!
  * \brief ndarray interface
@@ -82,6 +81,9 @@ class NDArray {
  public:
   /*! \brief default constructor */
   NDArray() {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = MKLMemHolder::create();
+#endif
   }
   /*!
    * \brief constructs a new dynamic NDArray
@@ -95,14 +97,56 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
         shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*! \brief constructor for NDArray with storage type
    */
   NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
           bool delay_alloc = true, int dtype = mshadow::default_type_flag,
           std::vector<int> aux_types = {}, std::vector<TShape> aux_shapes = {},
-          TShape storage_shape = TShape(mshadow::Shape1(0)));
-
+          TShape storage_shape = TShape(mshadow::Shape1(0)))
+      : shape_(shape), dtype_(dtype), storage_type_(stype),
+        entry_({nullptr, 0, 0}) {
+      // Assign default aux types if not given
+      if (aux_types.size() == 0) {
+        if (stype == kRowSparseStorage) {
+          aux_types = {mshadow::kInt64};
+        } else if (stype == kCSRStorage) {
+          aux_types = {mshadow::kInt64, mshadow::kInt64};
+        } else {
+          LOG(FATAL) << "Unknown storage type " << stype;
+        }
+      }
+      // Assign default shapes if not given
+      // unknown shapes are intialized as {0} such that Size() would return 0
+      if (aux_shapes.size() == 0) {
+        if (stype == kRowSparseStorage) {
+          aux_shapes = {TShape(mshadow::Shape1(0))};
+        } else if (stype == kCSRStorage) {
+          // aux shapes for indptr and indices
+          aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
+        } else {
+          LOG(FATAL) << "Unknown storage type " << stype;
+        }
+      }
+      if (storage_shape.Size() == 0) {
+        if (stype == kRowSparseStorage) {
+          storage_shape = shape;
+          storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
+        } else if (stype == kCSRStorage) {
+          storage_shape = aux_shapes[csr::kIdx];
+        } else {
+          LOG(FATAL) << "Unknown storage type " << stype;
+        }
+      }
+      ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
+                                     dtype, aux_types, aux_shapes);
+#if MKL_EXPERIMENTAL == 1
+      Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
+  }
   /*!
    * \brief constructing a static NDArray that shares data with TBlob
    *  Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -114,11 +158,17 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
         dtype_(data.type_flag_), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*! \brief create ndarray from shared memory */
   NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype)
       : ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)), shape_(shape),
         dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
 
   /*!
@@ -135,24 +185,11 @@ class NDArray {
           const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
       : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
         dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
 
-  /*
-   * This indicates whether an array is a view of another array (created by
-   * reshape or slice). If an array is a view and the the data is stored in
-   * MKLDNN format, we need to convert the data to the default format when
-   * data in the view is accessed.
-   */
-  inline bool IsView() const {
-    // View only works on the default storage
-    if (storage_type() != kDefaultStorage)
-      return false;
-    // If the array reuses memory, its shape may be different from the storage
-    // shape. However, we shouldn't consider it as a view.
-    if (reuse_)
-      return false;
-    return byte_offset_ > 0 || shape() != ptr_->storage_shape;
-  }
 
   /*!
    * \return the shape of current NDArray.
@@ -235,6 +272,9 @@ class NDArray {
             << "Unexpected storage type: " << stype;
       res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
     });
+#if MKL_EXPERIMENTAL == 1
+    res.Mkl_mem_ = Mkl_mem_;
+#endif
     return res;
   }
   /*!
@@ -495,12 +535,15 @@ class NDArray {
     CHECK_GE(ptr_->shandle.size,
              shape.Size() * mshadow::mshadow_sizeof(dtype))
         << "NDArray.AsArray: target memory size is bigger";
-    // We can't reuse memory in a view.
-    CHECK(!IsView());
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      // convert prv to cpu
+      Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr);
+    }
+#endif
     NDArray ret = *this;
     ret.shape_ = shape;
     ret.dtype_ = dtype;
-    ret.reuse_ = true;
     return ret;
   }
   /*!
@@ -569,83 +612,6 @@ class NDArray {
              << "CheckAndAllocAuxData is not intended for kDefaultStorage";
     ptr_->CheckAndAllocAuxData(i, aux_shape);
   }
-
-#if MXNET_USE_MKLDNN == 1
-  /*
-   * Test if the data is stored in one of special MKLDNN format.
-   */
-  bool IsMKLDNNData() const {
-    return ptr_->IsMKLDNN();
-  }
-  /*
-   * Test if the data is stored in one of default MXNet formats.
-   */
-  bool IsDefaultData() const {
-    return ptr_->IsDefault();
-  }
-  /*
-   * All functions below return a raw pointer to mkldnn memory. Actually there
-   * is a shared pointer that hold the memory either in NDArray or in MKLDNN
-   * stream. As long as we call these functions inside an operator, the return
-   * memory is always valid.
-   */
-
-  /*
-   * This function returns mkldnn::memory with the default primitive_desc.
-   */
-  const mkldnn::memory *GetMKLDNNData() const;
-  /*
-   * This function returns mkldnn::memory with the given primitive_desc
-   * as long as the array size meets the required size in the given primitive_desc.
-   */
-  const mkldnn::memory *GetMKLDNNData(
-      const mkldnn::memory::primitive_desc &desc) const;
-  /*
-   * This function returns mkldnn::memory with the given primitive_desc.
-   * The returned mkldnn::memory will have the same physical layout as
-   * the given primitive_desc.
-   */
-  const mkldnn::memory *GetMKLDNNDataReorder(
-      const mkldnn::memory::primitive_desc &desc) const;
-
-  /*
-   * This function copies data from mkldnn memory.
-   */
-  void CopyFrom(const mkldnn::memory &mem);
-  /*
-   * This function allocates memory for array and creates mkldnn memory
-   * with the specified format.
-   */
-  mkldnn::memory *CreateMKLDNNData(
-      const mkldnn::memory::primitive_desc &desc);
-
-  /*
-   * Reorder the memory to the specified layout.
-   */
-  void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc);
-  void Reorder2Default() {
-    CHECK_EQ(storage_type(), kDefaultStorage);
-    ptr_->Reorder2Default();
-  }
-
-  void InvalidateMKLDNNData() {
-    // Removing mkl_mem_ means the NDArray will store data in the default format.
-    ptr_->mkl_mem_ = nullptr;
-  }
-
-  /*
-   * This function is used inside operators to reshape an array.
-   * It doesn't change the layout of the original array and allocate memory from
-   * the temporary buffer. The returned array is only valid inside the current
-   * invocation of this operator.
-   * This is different from Reshape. Reshape will cause data in the array to be
-   * converted to the default layout and allocate memory from malloc directly,
-   * which can be expensive.
-   * It's used by FullyConnected right now.
-   */
-  NDArray MKLDNNDataReshape(const TShape &shape) const;
-#endif
-
   /*!
    * \brief Save list of ndarray into the Stream.x
    * \param fo The stream of output.
@@ -680,12 +646,6 @@ class NDArray {
                for csr, aux_handles[0] = indptr, aux_handles[1] = indices
     */
     std::vector<Storage::Handle> aux_handles;
-
-#if MXNET_USE_MKLDNN == 1
-    /*! This is created when data is stored in MKLDNN format.
-     */
-    std::shared_ptr<mkldnn::memory> mkl_mem_;
-#endif
     /*! \brief variable from engine */
     Engine::VarHandle var;
     /*!
@@ -747,7 +707,7 @@ class NDArray {
         : static_data(false), delay_alloc(false) {
       var = Engine::Get()->NewVariable();
       ctx = Context::CPUShared(0);
-      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
+      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);;
       shandle.ctx = ctx;
       shandle.shared_pid = shared_pid;
       shandle.shared_id = shared_id;
@@ -822,9 +782,6 @@ class NDArray {
     inline void CheckAndAlloc(void) {
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
-#if MXNET_USE_MKLDNN == 1
-        mkl_mem_ = nullptr;
-#endif
         delay_alloc = false;
       }
     }
@@ -837,18 +794,12 @@ class NDArray {
       dbytes = std::max(dbytes, static_cast<uint64_t>(shandle.size));
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
-#if MXNET_USE_MKLDNN == 1
-        mkl_mem_ = nullptr;
-#endif
         delay_alloc = false;
       } else if (shandle.size < dbytes) {
         // free storage if necessary and alloc again
         if (shandle.size > 0) Storage::Get()->Free(shandle);
         // init storage
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
-#if MXNET_USE_MKLDNN == 1
-        mkl_mem_ = nullptr;
-#endif
       }
     }
 
@@ -874,19 +825,20 @@ class NDArray {
     // storage shape is also updated
     // if data is already allocated, try reuse the storage. Otherwise, free the current one
     // and allocate new storage
-    void CheckAndAllocData(const TShape &shape, int dtype);
-
-#if MXNET_USE_MKLDNN == 1
-    // Have MKL memory reference to the data in the default storage
-    // or create memory for MKLDNN.
-    void SetMKLMem(const TShape &shape, int dtype);
-    // In the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
-    // save the result in shandle.
-    void Reorder2Default();
-    bool IsMKLDNN() const;
-    bool IsDefault() const;
-#endif
-
+    inline void CheckAndAllocData(const TShape &shape, int dtype) {
+      CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
+      auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
+      if (shandle.size < dbytes) {
+        // free storage if necessary and alloc again
+        if (shandle.size > 0) Storage::Get()->Free(shandle);
+        // init storage
+        shandle = Storage::Get()->Alloc(dbytes, ctx);
+      }
+      // init shape
+      storage_shape = shape;
+      // delay_alloc is only set when data storage handle is present
+      delay_alloc = false;
+    }
     // create storage handle for aux data based on shape
     // this function assumes ctx, aux shapes and aux types are set
     // aux shape is also updated
@@ -912,11 +864,45 @@ class NDArray {
       set_aux_shape(i, shape);
     }
     /*! \brief destructor */
-    ~Chunk();
+    ~Chunk() {
+      bool skip_free = static_data || delay_alloc;
+      Storage::Handle h = this->shandle;
+      std::vector<Storage::Handle> aux_h = this->aux_handles;
+      Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) {
+        if (skip_free == false) {
+          Storage::Get()->Free(h);
+          for (size_t i = 0; i < aux_h.size(); i++) {
+            if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]);
+          }
+        }
+      }, shandle.ctx, var);
+    }
   };  // struct Chunk
 
-  void SetTBlob() const;
+  void SetTBlob() const {
+    CHECK(ptr_ != nullptr);
+    TShape shape = shape_;
+    char *dptr = static_cast<char*>(ptr_->shandle.dptr);
+    auto stype = storage_type();
+    if (stype == kDefaultStorage) {
+      dptr += byte_offset_;
+    } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
+      shape = storage_shape();
+    } else {
+      LOG(FATAL) << "unknown storage type " << stype;
+    }
+    tblob_.dptr_ = dptr;
+    tblob_.shape_ = shape;
+    tblob_.type_flag_ = dtype_;
+    tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
+#if MKL_EXPERIMENTAL == 1
+    tblob_.Mkl_mem_ = Mkl_mem_;
+#endif
+  }
 
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
   /*! \brief internal data of NDArray */
   std::shared_ptr<Chunk> ptr_{nullptr};
   /*! \brief shape of current NDArray */
@@ -925,8 +911,6 @@ class NDArray {
   size_t byte_offset_ = 0;
   /*! \brief type of data */
   int dtype_ = -1;
-  /*! \brief whether the NDArray uses memory of another NDArray. */
-  bool reuse_ = false;
   /*! \brief storage type of data */
   NDArrayStorageType storage_type_ = kUndefinedStorage;
   /*! \brief node entry for autograd */
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index 168ddcca24..b65cd2b434 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -36,6 +36,9 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#endif
 namespace mxnet {
 
 /* Forward declaration for friend declaration in TBlob */
@@ -63,10 +66,17 @@ class TBlob {
   /*! \brief type flag of the tensor blob */
   int type_flag_;
 
+  /*! \brief storing mkl chunk buffer blob, use for experimental only */
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL),
         type_flag_(mshadow::DataType<real_t>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
     SetDLTensor(cpu::kDevMask, 0);
   }
   /*!
@@ -80,6 +90,9 @@ class TBlob {
   TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1)
       : dptr_(dptr), shape_(shape),
         type_flag_(mshadow::DataType<DType>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -92,6 +105,9 @@ class TBlob {
    */
   TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1)
       : dptr_(dptr), shape_(shape), type_flag_(type_flag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -119,6 +135,9 @@ class TBlob {
     shape_ = src.shape_;
     type_flag_ = mshadow::DataType<DType>::kFlag;
     SetDLTensor(Device::kDevMask, -1);
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
     return *this;
   }
   /*!
@@ -153,6 +172,11 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_),
                                              shape_.FlatTo2D(),
                                              shape_[shape_.ndim() - 1],
@@ -193,6 +217,11 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return static_cast<DType*>(dptr_);
   }
   /*! \brief device mask of the corresponding device */
diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
deleted file mode 100755
index 7cd7d6af06..0000000000
--- a/prepare_mkldnn.sh
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# set -ex
-#
-# All modification made by Intel Corporation: © 2016 Intel Corporation
-#
-# All contributions by the University of California:
-# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-# All rights reserved.
-#
-# All other contributions:
-# Copyright (c) 2014, 2015, the respective contributors
-# All rights reserved.
-# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
-#
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-
-MXNET_ROOTDIR="$(pwd)"
-MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/"
-MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src"
-MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build"
-MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install"
-MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib"
-
-# MKLDNN install destination
-HOME_MKLDNN=$1
-if [ ! -z "$HOME_MKLDNN" ]; then
-  mkdir -p $HOME_MKLDNN
-  if [ ! -w $HOME_MKLDNN ]; then
-    echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2
-    exit 1
-  fi
-fi
-
-if [ -z $MKLDNNROOT ]; then
-if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then
-    mkdir -p $MKLDNN_INSTALLDIR
-	cd $MKLDNN_ROOTDIR
-    if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then
-        rm -rf external && cd scripts && ./prepare_mkl.sh && cd ..
-        cp -a external/*/* $MKLDNN_INSTALLDIR/.
-    fi 
-    echo "Building MKLDNN ..." >&2
-    cd $MXNET_ROOTDIR
-	g++ --version >&2
-    if [ -z $ARCH_OPT ]; then
-        cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
-    else
-        cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR -DARCH_OPT_FLAGS=$ARCH_OPT
-    fi
-    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2
-    make -C $MKLDNN_BUILDDIR install
-    rm -rf $MKLDNN_BUILDDIR
-    mkdir -p $MKLDNN_LIBDIR
-    cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR
-fi
-MKLDNNROOT=$MKLDNN_INSTALLDIR
-fi
-
-if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then 
-  MKLROOT=$MKLDNNROOT;
-fi
-
-# user specified MKLDNN install folder
-if [ -d "$HOME_MKLDNN" ]; then
-  # skip if user specificed MKLDNNROOT
-  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
-  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
-  # update ldconfig if possible
-  if [ -w /etc/ld.so.conf.d ]; then
-    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
-  fi
-# return value to calling script (Makefile,cmake)
-  echo $HOME_MKLDNN $HOME_MKLDNN
-else
-  echo $MKLDNNROOT $MKLROOT
-fi
-
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 56f4b9c83e..6461904486 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1287,10 +1287,6 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
             arr[:] = arg_params[name]
         for name, arr in exe.aux_dict.items():
             arr[:] = aux_params[name]
-        # We need to initialize the gradient arrays if it's add.
-        if (grad_req == "add"):
-            for arr in exe.grad_arrays:
-                arr[:] = np.zeros(arr.shape, dtype=arr.dtype)
 
     dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
     max_idx = np.argmax(dtypes)
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index 5fd1a9b1d1..dcd1504fb8 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -43,61 +43,19 @@ namespace common {
           indices are not recorded
  * \return true if any source NDArray need to cast storage
  */
-inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
-                                const std::vector<NDArray> *bufs,
-                                std::vector<TBlob> *blobs,
-                                std::vector<NDArray> *temp_src,
-                                std::vector<NDArray> *temp_dst,
-                                std::unordered_map<uint32_t, uint32_t> *idx_map) {
+inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
+                              std::vector<TBlob> *blobs,
+                              std::vector<NDArray> *temp_src,
+                              std::vector<NDArray> *temp_dst,
+                              std::unordered_map<uint32_t, uint32_t> *idx_map = nullptr) {
   bool require_cast = false;
   for (size_t i = 0; i < src.size(); i++) {
     auto& nd = src[i];
-    bool is_default = nd.storage_type() == kDefaultStorage;
-#if MXNET_USE_MKLDNN == 1
-    // We have to make sure it's default storage and default layout.
-    is_default = nd.IsDefaultData();
-#endif
-    if (!is_default) {
-      (*idx_map)[i] = temp_dst->size();
-      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
-                                                             true, nd.dtype());
-#if MXNET_USE_MKLDNN == 1
-      CHECK(temp.IsDefaultData());
-#endif
-      temp_src->emplace_back(nd);
-      temp_dst->emplace_back(temp);
-      blobs->emplace_back(temp.data());
-      require_cast = true;
-    } else {
-      blobs->push_back(nd.data());
-    }
-  }
-  return require_cast;
-}
-
-inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
-                                 const std::vector<OpReqType> &req,
-                                 const std::vector<NDArray> *bufs,
-                                 std::vector<TBlob> *blobs,
-                                 std::vector<NDArray> *temp_src,
-                                 std::vector<NDArray> *temp_dst) {
-  bool require_cast = false;
-  for (size_t i = 0; i < src.size(); i++) {
-    auto& nd = src[i];
-    bool is_default = nd.storage_type() == kDefaultStorage;
-#if MXNET_USE_MKLDNN == 1
-    // If it's writeTo, we don't need to worry whether it contains valid data.
-    if (req[i] == kWriteTo && is_default)
-      const_cast<NDArray &>(nd).InvalidateMKLDNNData();
-    // We have to make sure it's default storage and default layout.
-    is_default = nd.IsDefaultData();
-#endif
-    if (!is_default) {
-      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
-                                                             true, nd.dtype());
-#if MXNET_USE_MKLDNN == 1
-      CHECK(temp.IsDefaultData());
-#endif
+    if (nd.storage_type() != kDefaultStorage) {
+      if (idx_map != nullptr) {
+        (*idx_map)[i] = temp_dst->size();
+      }
+      NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype());
       temp_src->emplace_back(nd);
       temp_dst->emplace_back(temp);
       blobs->emplace_back(temp.data());
@@ -118,9 +76,6 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
-                                   const std::vector<OpReqType> &req,
-                                   const std::vector<NDArray> *in_bufs,
-                                   const std::vector<NDArray> *out_bufs,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -130,11 +85,9 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    std::unordered_map<uint32_t, uint32_t> *in_temp_idx_map,
                                    const std::vector<uint32_t> &mutate_idx) {
   // populate input blobs
-  SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
-                      in_temp_idx_map);
+  SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
-                       post_temp_src);
+  SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
     auto map_iter = in_temp_idx_map->find(idx);
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index e4d4955462..1bcc40a894 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -30,8 +30,11 @@
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
 #include "./exec_pass.h"
-#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
-
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "../operator/mkl/mkl_memory-inl.h"
+#include "../operator/mkl/mkl_util-inl.h"
+#endif
 namespace mxnet {
 
 namespace op {
@@ -55,34 +58,23 @@ class StorageFallbackOpExecutor : public OpExecutor {
  protected:
   // initialize the data blobs
   void InitBlobs() {
+    using namespace common;
     if (!init_) {
-      pre_temp_buf_.clear();
-      post_temp_buf_.clear();
-      for (size_t i = 0; i < in_array.size(); i++) {
-        auto &nd = in_array[i];
-        pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
-      }
-      for (size_t i = 0; i < out_array.size(); i++) {
-        auto &nd = out_array[i];
-        post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
-      }
+      in_data_.clear(); out_data_.clear();
+      pre_temp_src_.clear(); pre_temp_dst_.clear();
+      post_temp_src_.clear(); post_temp_dst_.clear();
+      in_temp_idx_map_.clear();
+      SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_,
+                             &pre_temp_src_, &pre_temp_dst_,
+                             &post_temp_src_, &post_temp_dst_,
+                             &in_temp_idx_map_, mutate_idx_);
       init_ = true;
     }
   }
 
   // storage fallback before fcompute is launched
   void PreFCompute(bool is_gpu) {
-    using namespace common;
     InitBlobs();
-    in_data_.clear(); out_data_.clear();
-    pre_temp_src_.clear(); pre_temp_dst_.clear();
-    post_temp_src_.clear(); post_temp_dst_.clear();
-    in_temp_idx_map_.clear();
-    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
-                           &in_data_, &out_data_,
-                           &pre_temp_src_, &pre_temp_dst_,
-                           &post_temp_src_, &post_temp_dst_,
-                           &in_temp_idx_map_, mutate_idx_);
     common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu);
   }
 
@@ -93,8 +85,6 @@ class StorageFallbackOpExecutor : public OpExecutor {
 
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
-  // These are NDArray buffers for cast storage.
-  std::vector<NDArray> pre_temp_buf_, post_temp_buf_;
   // source NDArray for cast storage
   std::vector<NDArray> pre_temp_src_, post_temp_src_;
   // destination NDArray for cast storage
@@ -116,6 +106,10 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(state_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
+#if MKL_EXPERIMENTAL == 1
+    mkl_tblobs_prv_to_cpu(in_data_);
+    mkl_tblobs_prv_to_cpu(out_data_);
+#endif
   }
 
   ExecType exec_type() const override {
@@ -181,6 +175,10 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
+#if MKL_EXPERIMENTAL == 1
+    mkl_tblobs_prv_to_cpu(in_data_);
+    mkl_tblobs_prv_to_cpu(out_data_);
+#endif
   }
 
   ExecType exec_type() const override {
@@ -204,9 +202,6 @@ class FComputeExExecutor : public OpExecutor {
  public:
   void Run(RunContext rctx, bool is_gpu) override {
     op_ctx.run_ctx = rctx;
-#if MXNET_USE_MKLDNN == 1
-    InvalidateOutputs(out_array, req);
-#endif
     fcompute_(attrs_, op_ctx, in_array, req, out_array);
   }
 
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index f685370619..2a7d2b9066 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1209,8 +1209,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
       const NDArray& src = data_pool_.at(storage_id);
       data_entry_[i] = src.AsArray(vshape[i], vdtype[i]);
     } else {
-      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i],
-                               true, vdtype[i]);
+      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]);
     }
     if (log_verbose_) {
       LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type);
diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc
index 01fab22409..73a34c8b0f 100644
--- a/src/executor/infer_graph_attr_pass.cc
+++ b/src/executor/infer_graph_attr_pass.cc
@@ -423,6 +423,11 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph,
     DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
     graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
   }
+  // initialize unknown values for dispatch modes
+  if (graph.attrs.count("dispatch_mode") == 0) {
+    DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
+    graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
+  }
   // initialize the dev_mask vector from the context vector
   if (graph.attrs.count("dev_mask") == 0) {
     CHECK_GT(graph.attrs.count("context"), 0);
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index 93a8bc6c54..eaa95a5f24 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -214,12 +214,6 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph(
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
-  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
-  CHECK_EQ(stypes.size(), storage.size());
-  for (size_t i = 0; i < stypes.size(); i++) {
-    if (stypes[i] != kDefaultStorage)
-      storage[i] = exec::kDynamicStorageID;
-  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >(
@@ -326,10 +320,6 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph(
   for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID;
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID;
-  for (size_t i = 0; i < stypes.size(); i++) {
-    if (stypes[i] != kDefaultStorage)
-      storage[i] = exec::kDynamicStorageID;
-  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >("backward_ref_count"),
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 966a753dc1..fc28f50103 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -362,9 +362,9 @@ inline void PushFCompute(const FCompute& fn,
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
-                             &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
-                             &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
+      SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
+                             &pre_temp_src, &pre_temp_dst, &post_temp_src,
+                             &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
       OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
@@ -460,9 +460,9 @@ inline void PushOperator(const OpStatePtr& state,
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
         // populate input blobs and output blobs
-        SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
-                               &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
-                               &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
+        SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
+                               &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst,
+                               &in_temp_idx_map, mutate_idx);
         // setup contexts
         bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
         // pre-fcompute fallback
@@ -607,7 +607,6 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev
     }
     if (match) return true;
   }
-  g.attrs.erase("dispatch_mode");
   g.attrs.erase("storage_type");
   g.attrs.erase("storage_type_inputs");
   if (node_range.second > node_range.first) {
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index e01cc4206b..e98102b6b0 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -32,6 +32,11 @@
 #include "mxnet/engine.h"
 #include "ps/ps.h"
 #include "./kvstore_dist_server.h"
+#if MKL_EXPERIMENTAL == 1
+#include <mkl_memory.h>
+#include "../operator/mkl/mkl_memory-inl.h"
+#include "../operator/mkl/mkl_util-inl.h"
+#endif
 namespace mxnet {
 namespace kvstore {
 
@@ -232,6 +237,9 @@ class KVStoreDist : public KVStoreLocal {
         PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ?
                       EncodeDefaultKey(key, size, false) :
                       EncodeCompressedKey(key, size, false);
+#if MKL_EXPERIMENTAL == 1
+        mkl_set_tblob_eager_mode(recv_buf.data());
+#endif
         real_t* data = recv_buf.data().dptr<real_t>();
         // false means not to delete data when SArray is deleted
         auto vals = new ps::SArray<real_t>(data, size, false);
@@ -381,6 +389,9 @@ class KVStoreDist : public KVStoreLocal {
       [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) {
         size_t size = small_buf.shape().Size();
         real_t* data = small_buf.data().dptr<real_t>();
+#if MKL_EXPERIMENTAL == 1
+        mkl_set_tblob_eager_mode(small_buf.data());
+#endif
         // do push. false means no delete
         ps::SArray<real_t> vals(data, size, false);
         CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -405,6 +416,9 @@ class KVStoreDist : public KVStoreLocal {
           // convert to ps keys
           size_t size = send_buf.shape().Size();
           real_t* data = send_buf.data().dptr<real_t>();
+#if MKL_EXPERIMENTAL == 1
+          mkl_set_tblob_eager_mode(send_buf.data());
+#endif
           // do push. false means no delete
           ps::SArray<real_t> vals(data, size, false);
           CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -426,6 +440,9 @@ class KVStoreDist : public KVStoreLocal {
     using namespace rowsparse;
     auto push_to_servers = [this, key, send_buf]
                            (RunContext rctx, Engine::CallbackOnComplete cb) {
+#if MKL_EXPERIMENTAL == 1
+      mkl_set_tblob_eager_mode(send_buf.data());
+#endif
       real_t* data = send_buf.data().dptr<real_t>();
       const int64_t num_rows = send_buf.aux_shape(kIdx)[0];
       const auto offsets = send_buf.aux_data(kIdx).dptr<int64_t>();
@@ -464,6 +481,9 @@ class KVStoreDist : public KVStoreLocal {
       // allocate memory for the buffer
       size_t num_rows = indices.shape().Size();
       recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)});
+#if MKL_EXPERIMENTAL == 1
+      mkl_set_tblob_eager_mode(recv_buf.data());
+#endif
       real_t* data = recv_buf.data().dptr<real_t>();
       const auto offsets = indices.data().dptr<int64_t>();
       const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim());
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index ae7209e272..4db314f9cf 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -31,14 +31,10 @@
 #include <mxnet/resource.h>
 #include <mxnet/imperative.h>
 #include <mshadow/tensor.h>
-#if MXNET_USE_MKLDNN == 1
-#include <mkldnn.hpp>
-#endif
 #include "./ndarray_function.h"
 #include "../common/utils.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
-#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
 
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -50,104 +46,6 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
 
 namespace mxnet {
 
-NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
-    bool delay_alloc, int dtype, std::vector<int> aux_types,
-    std::vector<TShape> aux_shapes, TShape storage_shape) : shape_(shape),
-  dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) {
-  // Assign default aux types if not given
-  if (aux_types.size() == 0
-      && stype != kDefaultStorage) {
-    if (stype == kRowSparseStorage) {
-      aux_types = {mshadow::kInt64};
-    } else if (stype == kCSRStorage) {
-      aux_types = {mshadow::kInt64, mshadow::kInt64};
-    } else {
-      LOG(FATAL) << "Unknown storage type " << stype;
-    }
-  }
-  // Assign default shapes if not given
-  // unknown shapes are intialized as {0} such that Size() would return 0
-  if (aux_shapes.size() == 0
-      && stype != kDefaultStorage) {
-    if (stype == kRowSparseStorage) {
-      aux_shapes = {TShape(mshadow::Shape1(0))};
-    } else if (stype == kCSRStorage) {
-      // aux shapes for indptr and indices
-      aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
-    } else {
-      LOG(FATAL) << "Unknown storage type " << stype;
-    }
-  }
-  if (storage_shape.Size() == 0
-      && stype != kDefaultStorage) {
-    if (stype == kRowSparseStorage) {
-      storage_shape = shape;
-      storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
-    } else if (stype == kCSRStorage) {
-      storage_shape = aux_shapes[csr::kIdx];
-    } else {
-      LOG(FATAL) << "Unknown storage type " << stype;
-    }
-  }
-  if (stype == kDefaultStorage)
-    ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
-  else
-    ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
-        dtype, aux_types, aux_shapes);
-}
-
-struct ChunkMem {
-  Storage::Handle h;
-  std::vector<Storage::Handle> aux_h;
-#if MXNET_USE_MKLDNN == 1
-  std::shared_ptr<mkldnn::memory> mem;
-#endif
-};
-
-NDArray::Chunk::~Chunk() {
-  bool skip_free = static_data || delay_alloc;
-  ChunkMem mem;
-  mem.h = this->shandle;
-  mem.aux_h = this->aux_handles;
-#if MXNET_USE_MKLDNN == 1
-  // We want to delete mkldnn memory after deleting the variable.
-  mem.mem = this->mkl_mem_;
-#endif
-  Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) {
-    if (skip_free == false) {
-#if MXNET_USE_MKLDNN == 1
-      if (mem.mem) {
-        CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size);
-        CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr);
-      }
-#endif
-      if (mem.h.size > 0) Storage::Get()->Free(mem.h);
-      for (size_t i = 0; i < mem.aux_h.size(); i++) {
-        if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]);
-      }
-    }
-  }, shandle.ctx, var);
-}
-
-void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) {
-  CHECK_NE(aux_shapes.size(), 0)
-      << "data is expected to be allocated after aux_data";
-  auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
-  if (shandle.size < dbytes) {
-    // free storage if necessary and alloc again
-    if (shandle.size > 0) Storage::Get()->Free(shandle);
-    // init storage
-    shandle = Storage::Get()->Alloc(dbytes, ctx);
-#if MXNET_USE_MKLDNN == 1
-    mkl_mem_ = nullptr;
-#endif
-  }
-  // init shape
-  storage_shape = shape;
-  // delay_alloc is only set when data storage handle is present
-  delay_alloc = false;
-}
-
 NDArray NDArray::grad() const {
   if (Imperative::AGInfo::IsNone(*this)) return NDArray();
   Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
@@ -166,55 +64,15 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
   return ret;
 }
 
-#if MXNET_USE_MKLDNN == 1
-
-NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const {
-  CHECK(!is_none()) << "NDArray is not initialized";
-  CHECK_GE(shape_.Size(), shape.Size())
-    << "NDArray.Reshape: target shape size is larger current shape";
-  CHECK_EQ(storage_type(), kDefaultStorage);
-  if (!IsMKLDNNData()) {
-    NDArray ret = this->Detach();
-    ret.shape_ = shape;
-    return ret;
-  } else {
-    NDArray ret(shape, ctx(), true, dtype());
-    // We shouldn't submit the reorder primitive here because submit will
-    // be called in operators.
-    auto format = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc());
-    CHECK_NE(format, ptr_->mkl_mem_->get_primitive_desc().desc().data.format);
-    auto def_pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), format);
-    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
-    MKLDNNStream *stream = MKLDNNStream::Get();
-    stream->RegisterMem(ptr_->mkl_mem_);
-    stream->RegisterPrim(mkldnn::reorder(*ptr_->mkl_mem_, *def_mem));
-    // def_mem points to a memory region in the temp space. It's only valid
-    // inside an operator. As such, the returned NDArray can only be valid
-    // inside an operator and the shared pointer doesn't need to do anything
-    // when it's destroyed.
-    ret.ptr_->mkl_mem_ = std::shared_ptr<mkldnn::memory>(def_mem,
-                                                         [](mkldnn::memory *mem){});
-    ret.ptr_->shandle.dptr = def_mem->get_data_handle();
-    ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size();
-    ret.ptr_->delay_alloc = false;
-    ret.ptr_->static_data = true;
-    ret.byte_offset_ = byte_offset_;
-    return ret;
-  }
-}
-
-#endif
-
 NDArray NDArray::Reshape(const TShape &shape) const {
   CHECK(!is_none()) << "NDArray is not initialized";
+  auto stype = storage_type();
+  // reshape is not supported for non-default ndarray with mismatching shapes
+  CHECK((shape_ == shape) || stype == kDefaultStorage)
+    << "Reshape for storage type " << stype << " is not implemented yet";
   CHECK_GE(shape_.Size(), shape.Size())
     << "NDArray.Reshape: target shape size is larger current shape";
   NDArray ret = this->Detach();
-  // If the shape doesn't change, we can just return it now.
-  if (ret.shape_ == shape)
-    return ret;
-  // Otherwise, reshape only works on the default layout.
-  CHECK_EQ(storage_type(), kDefaultStorage);
   ret.shape_ = shape;
   return ret;
 }
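
The reverted Reshape above makes its preconditions explicit: a non-default (sparse) NDArray may only "reshape" to its existing shape, and a dense reshape may not grow the element count. A small standalone sketch of that rule, using plain std::vector shapes in place of TShape:

    #include <cstddef>
    #include <numeric>
    #include <vector>

    enum Storage { kDefaultStorage, kRowSparseStorage, kCSRStorage };

    size_t num_elements(const std::vector<int> &shape) {
      return std::accumulate(shape.begin(), shape.end(), size_t{1},
                             [](size_t a, int b) { return a * static_cast<size_t>(b); });
    }

    // Mirrors the two CHECKs above: sparse storage only allows an identical shape,
    // and the target may never hold more elements than the source.
    bool reshape_allowed(const std::vector<int> &cur, const std::vector<int> &target,
                         Storage stype) {
      if (stype != kDefaultStorage && cur != target) return false;
      return num_elements(cur) >= num_elements(target);
    }
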
@@ -237,6 +95,7 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) {
   return ret;
 }
 
+
 NDArray NDArray::Slice(index_t begin, index_t end) const {
   CHECK(!is_none()) << "NDArray is empty";
   CHECK_LE(begin, end)
@@ -268,8 +127,8 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
 }
 
 NDArray NDArray::At(index_t idx) const {
-  CHECK(storage_type() == kDefaultStorage)
-      << "Storage type " << storage_type() << " doesn't support At()";
+  CHECK(storage_type() == kDefaultStorage) << "Storage type "
+                                           << storage_type() << " doesn't support At()";
   NDArray ret = this->Slice(idx, idx+1);
   if (shape_.ndim() > 1) {
     return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
@@ -322,400 +181,6 @@ void NDArray::set_fresh_out_grad(bool state) const {
   info.fresh_out_grad = state;
 }
 
-#if MXNET_USE_MKLDNN == 1
-static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) {
-  if (shape.ndim() != (size_t)ndims)
-    return false;
-  for (int i = 0; i < ndims; i++)
-    if (shape[i] != dims[i])
-      return false;
-  return true;
-}
-
-static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) {
-  return same_shape(shape, desc.data.dims, desc.data.ndims)
-      && get_mkldnn_type(dtype) == desc.data.data_type;
-}
-
-bool NDArray::Chunk::IsMKLDNN() const {
-  if (storage_type != kDefaultStorage)
-    return false;
-  if (mkl_mem_ == nullptr)
-    return false;
-  auto desc = mkl_mem_->get_primitive_desc().desc();
-  return desc.data.format != GetDefaultFormat(desc);
-}
-
-bool NDArray::Chunk::IsDefault() const {
-  if (storage_type != kDefaultStorage)
-    return false;
-  // If we don't have mkldnn memory yet, we just assume it's in the default
-  // format.
-  if (mkl_mem_ == nullptr)
-    return true;
-  auto desc = mkl_mem_->get_primitive_desc().desc();
-  return desc.data.format == GetDefaultFormat(desc);
-}
-
-void NDArray::Chunk::Reorder2Default() {
-  if (mkl_mem_ == nullptr)
-    return;
-
-  auto format = GetDefaultFormat(mkl_mem_->get_primitive_desc().desc());
-  CHECK(format != mkl_mem_->get_primitive_desc().desc().data.format);
-
-  auto def_pd = GetPrimitiveDesc(mkl_mem_->get_primitive_desc(), format);
-  mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd));
-  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
-  std::vector<mkldnn::primitive> net;
-  net.push_back(mkldnn::reorder(*mkl_mem_, *def_mem));
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-
-  CHECK(shandle.size >= def_pd.get_size());
-  CheckAndAlloc(def_pd.get_size());
-  // TODO(zhengda) We need to avoid memory copy here.
-  memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size());
-  mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr));
-}
-
-void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
-  // The shape of the array and the one of the MKL memory may mismatch.
-  // For example, if the array stores parameters, the MKL memory may store data
-  // in 5 dimensions while the NDArray stores data in 4 dimensions.
-  if (mkl_mem_ && mkl_mem_->get_data_handle() == shandle.dptr
-      && same_shape(shape, dtype, mkl_mem_->get_primitive_desc().desc())) {
-    return;
-  }
-
-  mkldnn::memory::dims dims;
-  // These are shapes supported by MKLDNN.
-  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4
-      || shape.ndim() == 5) {
-    dims.resize(shape.ndim());
-    for (size_t i = 0; i < dims.size(); i++)
-      dims[i] = shape[i];
-  } else if (shape.ndim() == 3) {
-    // If there are 3 dimensions, we'll force it to 4 dimensions.
-    dims.resize(shape.ndim() + 1);
-    dims[0] = 1;
-    for (size_t i = 0; i < shape.ndim(); i++)
-      dims[i + 1] = shape[i];
-  } else {
-    LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
-  }
-  mkldnn::memory::format layout = mkldnn::memory::format::format_undef;
-  switch (dims.size()) {
-    case 1: layout = mkldnn::memory::format::x; break;
-    case 2: layout = mkldnn::memory::format::nc; break;
-    case 4: layout = mkldnn::memory::format::nchw; break;
-    // This isn't the right layout when the data has 5 dimensions in MXNet.
-    // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have
-    // a corresponding format.
-    case 5: layout = mkldnn::memory::format::goihw; break;
-  }
-  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
-  auto cpu_engine = CpuEngine::Get()->get_engine();
-  if (shandle.dptr == nullptr) {
-    CHECK(delay_alloc);
-    CheckAndAlloc();
-  }
-  mkldnn::memory::primitive_desc pd(data_md, cpu_engine);
-  CHECK(shandle.size >= pd.get_size());
-  mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr));
-}
-
-/*
- * Here we want to get MKLDNN memory whose primitive desc is exactly the same as
- * the given one. operator== can't guarantee that. == can return true even if
- * the formats are different. I need to double check its format.
- */
-static inline mkldnn::memory *GetMKLDNNExact(
-    const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) {
-  auto src_desc = mem->get_primitive_desc();
-  if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) {
-    return const_cast<mkldnn::memory *>(mem);
-  } else {
-    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
-            desc, mem->get_data_handle()));
-    MKLDNNStream::Get()->RegisterMem(ret);
-    return ret.get();
-  }
-}
-
-const mkldnn::memory *NDArray::GetMKLDNNData(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
-    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
-    return nullptr;
-  }
-  auto mem = GetMKLDNNData();
-  mkldnn::memory::primitive_desc _desc = desc;
-  auto desc1 = mem->get_primitive_desc().desc();
-  auto desc2 = _desc.desc();
-  // The MKL memory has the same format and shape as required,
-  // or both use the default format, we can return the MKL memory.
-  if (mem->get_primitive_desc() == desc
-      || (desc1.data.format == GetDefaultFormat(desc1)
-        && desc2.data.format == GetDefaultFormat(desc2))) {
-    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
-  } else {
-    return nullptr;
-  }
-}
-
-const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
-    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
-    return nullptr;
-  }
-  CHECK(storage_type() == kDefaultStorage);
-
-  auto mem = GetMKLDNNData();
-  // If the memory descriptor matches, it's easy.
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  if (mem->get_primitive_desc() == desc) {
-    return GetMKLDNNExact(mem, desc);
-  }
-
-  mkldnn::memory::primitive_desc _desc = desc;
-  // Now we need to determine if we should reorder the memory.
-  // If both use the default formats, we think we don't need to reorder.
-  auto desc1 = mem->get_primitive_desc().desc();
-  auto desc2 = _desc.desc();
-  if (desc1.data.format == GetDefaultFormat(desc1) &&
-      desc2.data.format == GetDefaultFormat(desc2)) {
-    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
-    stream->RegisterMem(ret);
-    return ret.get();
-  } else {
-    auto ret = TmpMemMgr::Get()->Alloc(desc);
-    stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
-    return ret;
-  }
-}
-
-const mkldnn::memory *NDArray::GetMKLDNNData() const {
-  CHECK(storage_type() == kDefaultStorage);
-  // If this array uses MKLDNN layout and it's a view, we have to change its
-  // layout to the default layout.
-  if (IsMKLDNNData() && IsView())
-    ptr_->Reorder2Default();
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
-  // If shandle has data, the data in shandle and mkl_mem_ should match.
-  if (ptr_->shandle.dptr)
-    CHECK(ptr_->shandle.dptr == ptr_->mkl_mem_->get_data_handle());
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
-  auto pd = ptr_->mkl_mem_->get_primitive_desc();
-  if (IsView()) {
-    // Sliced array must use the default layout.
-    CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
-  }
-  if (IsView()) {
-    void *off_addr = static_cast<char *>(ptr_->mkl_mem_->get_data_handle())
-        + byte_offset_;
-
-    // Create the primitive desc for the new mkldnn memory.
-    mkldnn::memory::dims dims(shape().ndim());
-    for (size_t i = 0; i < dims.size(); i++)
-      dims[i] = shape()[i];
-    mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(
-        GetDefaultFormat(shape().ndim()));
-    mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
-        pd.desc().data.data_type);
-    mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
-
-    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(new_pd, off_addr));
-    MKLDNNStream::Get()->RegisterMem(ret);
-    return ret.get();
-  } else {
-    return ptr_->mkl_mem_.get();
-  }
-}
-
-void NDArray::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) {
-  CHECK_EQ(storage_type(), kDefaultStorage);
-  // If the memory already uses the specified layout, don't do anything.
-  if (ptr_->mkl_mem_ != nullptr && ptr_->mkl_mem_->get_primitive_desc() == pd)
-    return;
-  auto _pd = pd;
-  auto _desc = _pd.desc();
-  auto def_format = GetDefaultFormat(_desc);
-  // If the memory is default, don't do anything.
-  if (def_format == _desc.data.format && ptr_->IsDefault())
-    return;
-  // If the specified layout is default, we should use Reorder2Default.
-  if (def_format == _desc.data.format) {
-    ptr_->Reorder2Default();
-    return;
-  }
-
-  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(pd));
-  ptr_->SetMKLMem(shape_, dtype_);
-  auto old_mem = ptr_->mkl_mem_;
-  // It's possible that the specified layout has a different number of dimensions.
-  if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) {
-    // For now, we only support reorder from the default layout.
-    CHECK(ptr_->IsDefault());
-    auto def_pd = GetPrimitiveDesc(pd, def_format);
-    old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle()));
-  }
-  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
-  std::vector<mkldnn::primitive> net;
-  net.push_back(mkldnn::reorder(*old_mem, *new_mem));
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-
-  CHECK(ptr_->shandle.size >= pd.get_size());
-  ptr_->CheckAndAlloc(pd.get_size());
-  // TODO(zhengda) We need to avoid memory copy here.
-  memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size());
-  ptr_->mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr));
-}
-
-void NDArray::CopyFrom(const mkldnn::memory &mem) {
-  CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
-  if (ptr_->mkl_mem_.get() == &mem)
-    return;
-
-  CHECK(mem.get_primitive_desc().get_size() == shape().Size() * GetTypeSize(dtype_))
-      << "The size of NDArray doesn't match the requested MKLDNN memory desc";
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  // If this array uses MKLDNN layout and it's a view, we have to change its
-  // layout to the default layout.
-  if (IsMKLDNNData() && IsView())
-    ptr_->Reorder2Default();
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
-                  dtype_);
-  stream->RegisterMem(ptr_->mkl_mem_);
-  auto from_desc = mem.get_primitive_desc().desc();
-  auto this_desc = ptr_->mkl_mem_->get_primitive_desc().desc();
-  auto from_def_format = GetDefaultFormat(from_desc);
-  if (IsView()) {
-    // Sliced array must use the default layout.
-    CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format);
-  }
-  // It's possible that the memory and the NDArray don't have the same shape.
-  if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)
-      // If the source memory uses the default layout, we can reshape directly.
-      && from_def_format == from_desc.data.format) {
-    // In this case, we can simply create a new MKLDNN memory for the required
-    // shape.
-    mkldnn::memory::dims dims(this_desc.data.dims,
-                              this_desc.data.dims + this_desc.data.ndims);
-    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
-    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
-    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
-    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
-    stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
-  } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) {
-    // In this case, the source memory stores data in a customized layout. We
-    // need to reorganize the data in memory before we can reshape.
-    auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format);
-    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
-    stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
-    // Now we can reshape it
-    mkldnn::memory::dims dims(this_desc.data.dims,
-                              this_desc.data.dims + this_desc.data.ndims);
-    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
-    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
-    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
-    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
-    stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
-  } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->get_primitive_desc()) {
-    // If the layout is the same, we can just copy data.
-    stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_));
-  } else {
-    auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
-    auto dst_def = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc());
-    // If neither memory uses the default layout, there isn't much we can do
-    // other than reorder the data layout directly.
-    if (dst_def != ptr_->mkl_mem_->get_primitive_desc().desc().data.format
-        && src_def != mem.get_primitive_desc().desc().data.format) {
-      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_));
-    } else if (dst_def == ptr_->mkl_mem_->get_primitive_desc().desc().data.format) {
-      // If the dest mem uses the default memory layout, we can simply use
-      // the default format of the source memory to improve perf of reorder.
-      auto pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), src_def);
-      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->get_data_handle()));
-      stream->RegisterMem(tmp_mem);
-      stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
-    } else {
-      // If the src mem uses the default memory layout, we can use
-      // the default format of the source memory to improve perf.
-      auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
-      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
-      stream->RegisterMem(tmp_mem);
-      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
-    }
-  }
-}
-mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
-                                                mkldnn_memory_format_t format);
-
-mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
-  // This array shouldn't be a view.
-  CHECK(!IsView());
-
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
-    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
-    return nullptr;
-  }
-
-  mkldnn::memory::primitive_desc _desc = desc;
-  auto required_format = _desc.desc().data.format;
-  auto def_format = GetDefaultFormat(_desc.desc());
-  // If the required format is a default format, we don't need to worry about the shape.
-  // If the shape isn't the same, it actually implicitly reshapes data.
-  if (required_format == def_format) {
-    ptr_->SetMKLMem(shape_, dtype_);
-    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
-    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
-  }
-
-  if (ptr_->mkl_mem_)
-    CHECK(ptr_->mkl_mem_->get_data_handle() == ptr_->shandle.dptr);
-  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->get_primitive_desc() == desc) {
-    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
-    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
-  }
-
-  CHECK(ptr_->shandle.size >= desc.get_size());
-  ptr_->CheckAndAlloc(desc.get_size());
-  ptr_->mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr));
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
-  return ptr_->mkl_mem_.get();
-}
-#endif
-
-void NDArray::SetTBlob() const {
-  CHECK(ptr_ != nullptr);
-  TShape shape = shape_;
-  char *dptr = static_cast<char*>(ptr_->shandle.dptr);
-  auto stype = storage_type();
-  if (stype == kDefaultStorage) {
-#if MXNET_USE_MKLDNN == 1
-    if (IsMKLDNNData()) {
-      ptr_->Reorder2Default();
-      dptr = static_cast<char*>(ptr_->shandle.dptr);
-    }
-#endif
-    dptr += byte_offset_;
-  } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
-    CHECK_EQ(byte_offset_, 0);
-    shape = storage_shape();
-  } else {
-    LOG(FATAL) << "unknown storage type " << stype;
-  }
-  tblob_.dptr_ = dptr;
-  tblob_.shape_ = shape;
-  tblob_.type_flag_ = dtype_;
-  tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
-}
 
 /*!
 * \brief run a ternary operation
@@ -984,51 +449,11 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
 // Make a copy of a dense NDArray
 template<typename from_xpu, typename to_xpu>
 inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
-#if MXNET_USE_MKLDNN == 1
-  // If neither is MKLDNN, we can copy data normally.
-  if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) {
-#endif
-    using namespace mshadow;
-    CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
-    TBlob tmp = to.data();
-    ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
-                                    from.ctx(), to.ctx(), ctx);
-#if MXNET_USE_MKLDNN == 1
-  } else if (SupportMKLDNN(from.dtype(), from.shape())
-             && SupportMKLDNN(to.dtype(), to.shape())
-             && from.ctx().dev_mask() == cpu::kDevMask
-             && to.ctx().dev_mask() == cpu::kDevMask) {
-    // If we copy data directly, we need to make sure both NDArrays are supported
-    // by MKLDNN.
-    auto from_mem = from.GetMKLDNNData();
-    auto to_mem = to.GetMKLDNNData();
-    if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) {
-      size_t size = std::min(from_mem->get_primitive_desc().get_size(),
-                             to_mem->get_primitive_desc().get_size());
-      memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
-    } else {
-      std::vector<mkldnn::primitive> net;
-      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    }
-  } else {
-    // In this case, one of the NDArray isn't supported by MKLDNN, we need
-    // to convert the MKLDNN array to the default format first and copy data
-    // with Copy().
-    NDArray tmp_from = from;
-    if (tmp_from.IsMKLDNNData()) {
-      tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
-      auto tmp_mem = from.GetMKLDNNData();
-      tmp_from.CopyFrom(*tmp_mem);
-      MKLDNNStream::Get()->Submit();
-    }
-    CHECK(tmp_from.IsDefaultData());
-    CHECK(to.IsDefaultData());
-    TBlob tmp = to.data();
-    ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
-                                    from.ctx(), to.ctx(), ctx);
-  }
-#endif
+  using namespace mshadow;
+  CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
+  TBlob tmp = to.data();
+  ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
+                                  from.ctx(), to.ctx(), ctx);
 }
 
 // Make a copy of an NDArray based on storage type
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
new file mode 100644
index 0000000000..4225ddf4ea
--- /dev/null
+++ b/src/operator/concat-inl.h
@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_CONCAT_INL_H_
+#define MXNET_OPERATOR_CONCAT_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+#include "./channel_op_common.h"
+#include "./tensor/broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace concat_enum {
+enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
+enum ConcatOpOutputs {kOut};
+}  // namespace concat_enum
+
+struct ConcatParam : public dmlc::Parameter<ConcatParam> {
+  int num_args;
+  int dim;
+  DMLC_DECLARE_PARAMETER(ConcatParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
+    .describe("Number of inputs to be concated.");
+    DMLC_DECLARE_FIELD(dim).set_default(1)
+    .describe("the dimension to be concated.");
+  }
+};  // struct ConcatParam
+
+template<typename xpu, typename DType>
+class ConcatOp : public Operator {
+ public:
+  explicit ConcatOp(ConcatParam param)
+    : size_(param.num_args), dimension_(param.dim) {}
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1U);
+    int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > data(size_);
+    Tensor<xpu, 3, DType> out;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < axis; ++i) {
+      leading *= out_data[concat_enum::kOut].shape_[i];
+    }
+    for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
+      trailing *= out_data[concat_enum::kOut].shape_[i];
+    }
+    size_t mid = out_data[concat_enum::kOut].shape_[axis];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing);
+      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Concatenate(data, &out, 1, req[concat_enum::kOut]);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
+    int axis = CheckAxis(dimension_, out_grad[concat_enum::kData0].ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
+    Tensor<xpu, 3, DType> grad;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < axis; ++i) {
+      leading *= out_grad[concat_enum::kOut].shape_[i];
+    }
+    for (int i = axis + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) {
+      trailing *= out_grad[concat_enum::kOut].shape_[i];
+    }
+    size_t mid = out_grad[concat_enum::kOut].shape_[axis];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing);
+      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Split(grad, &grad_in, 1, req);
+  }
+
+ private:
+  int size_;
+  int dimension_;
+};  // class ConcatOp
+
+template<typename xpu>
+Operator *CreateOp(ConcatParam param, int dtype, std::vector<TShape> *in_shape);
+
+#if DMLC_USE_CXX11
+class ConcatProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    std::vector<std::string> ret;
+    for (int i = 0; i < param_.num_args; ++i) {
+      ret.push_back(std::string("arg") + std::to_string(i));
+    }
+    return ret;
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+    TShape dshape;
+    index_t size = 0;
+    bool has_zero = false;
+    int axis = -1;
+    for (int i = 0; i < param_.num_args; ++i) {
+      TShape tmp = (*in_shape)[i];
+      if (tmp.ndim()) {
+        axis = CheckAxis(param_.dim, tmp.ndim());
+        has_zero = tmp[axis] == 0 || has_zero;
+        size += tmp[axis];
+        tmp[axis] = 0;
+        shape_assign(&dshape, tmp);
+      }
+    }
+
+    TShape tmp = (*out_shape)[0];
+    if (tmp.ndim()) {
+      axis = CheckAxis(param_.dim, tmp.ndim());
+      tmp[axis] = 0;
+      shape_assign(&dshape, tmp);
+    }
+
+    if (dshape.ndim() == 0) return false;
+
+    for (int i = 0; i < param_.num_args; ++i) {
+      CHECK(shape_assign(&(*in_shape)[i], dshape))
+        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
+    }
+
+    if (!has_zero) dshape[axis] = size;
+    CHECK(shape_assign(&(*out_shape)[0], dshape))
+      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
+
+    return dshape.Size() != 0;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    int dtype = -1;
+
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);
+      } else {
+        CHECK(in_type->at(i) == dtype ||
+              in_type->at(i) == -1) <<
+              "Non-uniform data type in Concat";
+      }
+    }
+
+    if (dtype == -1) {
+      LOG(FATAL) << "Not enough information to infer type in Concat.";
+      return false;
+    }
+
+    size_t nin = this->ListArguments().size();
+    in_type->clear();
+    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+
+    size_t naux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
+
+    size_t nout = this->ListOutputs().size();
+    out_type->clear();
+    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new ConcatProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "Concat";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return out_grad;
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  ConcatParam param_;
+};  // class ConcatProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONCAT_INL_H_
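
ConcatOp::Forward above reduces a concat of arbitrary rank to a 3-D problem by viewing each input as (leading, shape[axis], trailing) and concatenating along the middle dimension. A standalone sketch of that flattening, in plain C++ without mshadow:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Flat3 { size_t leading, mid, trailing; };

    // Everything before `axis` is folded into `leading`, everything after it
    // into `trailing`; `mid` is the axis being concatenated.
    Flat3 flatten_around_axis(const std::vector<size_t> &shape, size_t axis) {
      Flat3 f{1, shape[axis], 1};
      for (size_t i = 0; i < axis; ++i) f.leading *= shape[i];
      for (size_t i = axis + 1; i < shape.size(); ++i) f.trailing *= shape[i];
      return f;
    }

    int main() {
      // A (2, 3, 4, 5) input concatenated along dim=1 is handled as (2, 3, 20).
      Flat3 f = flatten_around_axis({2, 3, 4, 5}, 1);
      std::cout << f.leading << " " << f.mid << " " << f.trailing << "\n";  // 2 3 20
    }
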
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
new file mode 100644
index 0000000000..4d3c2fa166
--- /dev/null
+++ b/src/operator/concat.cc
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./concat-inl.h"
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "./mkl/mkl_memory-inl.h"
+#include "./mkl/mkl_concat-inl.h"
+#endif  // MXNET_USE_MKL2017
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<cpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
+  Operator *op = NULL;
+#if MXNET_USE_MKL2017 == 1
+  // MKL supports 4D input tensors only for concat operation
+  // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h
+  // hence MKL supports 2D/3D/4D input tensors for concat operation
+  size_t dims = (*in_shape)[0].ndim();
+  bool supportedDim = (dims >= 2 && dims <= 4);
+  if ((1 == param.dim) && supportedDim &&
+    (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) {
+    switch (dtype) {
+      case mshadow::kFloat32:
+      return new MKLConcatOp<cpu, float>(param);
+    case mshadow::kFloat64:
+      return new MKLConcatOp<cpu, double>(param);
+    default:
+      break;
+    }
+  }
+  if (enableMKLWarnGenerated())
+    LOG(INFO) << MKLConcatOp<cpu, float>::getName() << " Skip MKL optimization";
+#endif
+  MSHADOW_TYPE_SWITCH(dtype, DType, {
+    op = new ConcatOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                       std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape);
+}
+
+DMLC_REGISTER_PARAMETER(ConcatParam);
+
+MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp)
+.describe(R"code(Joins input arrays along a given axis.
+
+.. note:: `Concat` is deprecated. Use `concat` instead.
+
+The dimensions of the input arrays should be the same except the axis along
+which they will be concatenated.
+The dimension of the output array along the concatenated axis will be equal
+to the sum of the corresponding dimensions of the input arrays.
+
+Example::
+
+   x = [[1,1],[2,2]]
+   y = [[3,3],[4,4],[5,5]]
+   z = [[6,6], [7,7],[8,8]]
+
+   concat(x,y,z,dim=0) = [[ 1.,  1.],
+                          [ 2.,  2.],
+                          [ 3.,  3.],
+                          [ 4.,  4.],
+                          [ 5.,  5.],
+                          [ 6.,  6.],
+                          [ 7.,  7.],
+                          [ 8.,  8.]]
+
+   Note that you cannot concat x,y,z along dimension 1 since dimension
+   0 is not the same for all the input arrays.
+
+   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
+                         [ 4.,  4.,  7.,  7.],
+                         [ 5.,  5.,  8.,  8.]]
+
+)code" ADD_FILELINE)
+.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
+.add_arguments(ConcatParam::__FIELDS__())
+.set_key_var_num_args("num_args");
+
+NNVM_REGISTER_OP(Concat).add_alias("concat");
+
+}  // namespace op
+}  // namespace mxnet
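
The CPU CreateOp above only hands concat to the MKL2017 kernel when concatenating along dim 1, when the inputs are 2-D to 4-D (lower ranks are reshaped to 4-D inside mkl_concat-inl.h), and when the number of inputs stays below dnnResourceMultipleDst - dnnResourceMultipleSrc. A sketch of that predicate; the numeric limit is only a stand-in for that MKL constant, not its actual value:

    // Assumed stand-in for dnnResourceMultipleDst - dnnResourceMultipleSrc.
    const int kMaxMklConcatInputs = 32;

    bool use_mkl_concat(int concat_dim, size_t ndims, int num_args) {
      const bool supported_rank = ndims >= 2 && ndims <= 4;
      return concat_dim == 1 && supported_rank && num_args < kMaxMklConcatInputs;
    }
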
diff --git a/src/operator/nn/concat.cu b/src/operator/concat.cu
similarity index 81%
rename from src/operator/nn/concat.cu
rename to src/operator/concat.cu
index f6bf5ece5c..394fa736ee 100644
--- a/src/operator/nn/concat.cu
+++ b/src/operator/concat.cu
@@ -28,12 +28,14 @@
 
 namespace mxnet {
 namespace op {
-
-NNVM_REGISTER_OP(Concat)
-.set_attr<FCompute>("FCompute<gpu>", ConcatCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Concat)
-.set_attr<FCompute>("FCompute<gpu>", ConcatGradCompute<gpu>);
+template<>
+Operator* CreateOp<gpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
+  Operator *op = NULL;
+  MSHADOW_TYPE_SWITCH(dtype, DType, {
+    op = new ConcatOp<gpu, DType>(param);
+  });
+  return op;
+}
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc
index 86c0fbb332..7de6a34425 100644
--- a/src/operator/convolution_v1.cc
+++ b/src/operator/convolution_v1.cc
@@ -25,6 +25,11 @@
 */
 
 #include "./convolution_v1-inl.h"
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "./mkl/mkl_memory-inl.h"
+#include "./mkl/mkl_convolution-inl.h"
+#endif  // MXNET_USE_MKL2017
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h
new file mode 100644
index 0000000000..adfe467670
--- /dev/null
+++ b/src/operator/lrn-inl.h
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_LRN_INL_H_
+#define MXNET_OPERATOR_LRN_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace lrn_enum {
+enum LRNInputs {kData};
+enum LRNOutputs {kOut, kTmpNorm};
+}  // namespace lrn_enum
+
+struct LRNParam : public dmlc::Parameter<LRNParam> {
+  float alpha;
+  float beta;
+  float knorm;
+  uint32_t nsize;
+  DMLC_DECLARE_PARAMETER(LRNParam) {
+    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
+    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
+    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
+    .describe("The power parameter :math:`\beta` in the LRN expression.");
+    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
+    .describe("The parameter :math:`k` in the LRN expression.");
+    DMLC_DECLARE_FIELD(nsize)
+    .describe("normalization window width in elements.");
+  }
+};  // struct LRNParam
+
+template<typename xpu>
+class LocalResponseNormOp : public Operator {
+ public:
+  explicit LocalResponseNormOp(LRNParam param) {
+    param_ = param;
+  }
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    // TODO(xxx): Test with gradient checker
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 2U);
+    // CHECK_EQ(req.size(), 2);
+    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
+    const real_t salpha = param_.alpha / param_.nsize;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
+    tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
+    Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 2U);
+    const real_t salpha = param_.alpha / param_.nsize;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4> grad = out_grad[lrn_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4> grad_in = in_grad[lrn_enum::kData].get<xpu, 4, real_t>(s);
+    grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
+    grad_in += (- 2.0f * param_.beta * salpha) *
+               chpool<red::sum>(grad * data *
+                                F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
+                                param_.nsize)  * data;
+  }
+
+ private:
+  LRNParam param_;
+};  // class LocalResponseNormOp
+
+template<typename xpu>
+Operator *CreateOp(LRNParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class LocalResponseNormProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    int n_out = this->ListOutputs().size();
+    out_type->clear();
+    for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new LocalResponseNormProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "LRN";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {
+      out_grad[lrn_enum::kOut], in_data[lrn_enum::kData],
+      out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut]
+    };
+  }
+
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 2;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "tmp_norm"};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  LRNParam param_;
+};  // LocalResponseNormProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_LRN_INL_H_
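
LocalResponseNormOp::Forward above computes out = data * (knorm + alpha/nsize * sum_window data^2)^(-beta), with the window summed across channels by chpool. A standalone numeric sketch of the same expression for one spatial position, following the window definition given in the LRN docstring in lrn.cc below:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // in[i] holds the activation of channel i at one (x, y) position.
    std::vector<float> lrn_channels(const std::vector<float> &in, int nsize,
                                    float alpha, float beta, float knorm) {
      const int n = static_cast<int>(in.size());
      const float salpha = alpha / nsize;  // same scaling as the operator above
      std::vector<float> out(n);
      for (int i = 0; i < n; ++i) {
        const int lo = std::max(0, i - nsize / 2);
        const int hi = std::min(n - 1, i + nsize / 2);
        float acc = 0.f;
        for (int j = lo; j <= hi; ++j) acc += in[j] * in[j];
        out[i] = in[i] * std::pow(knorm + salpha * acc, -beta);
      }
      return out;
    }
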
diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc
new file mode 100644
index 0000000000..9b3afd80cd
--- /dev/null
+++ b/src/operator/lrn.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./lrn-inl.h"
+#if MXNET_USE_CUDNN == 1
+#include "./cudnn_lrn-inl.h"
+#endif
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "./mkl/mkl_memory-inl.h"
+#include "./mkl/mkl_lrn-inl.h"
+#endif
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<cpu>(LRNParam param, int dtype) {
+#if MXNET_USE_MKL2017 == 1
+  return new MKLLRNOp<cpu, float>(param);
+#endif
+  return new LocalResponseNormOp<cpu>(param);
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+    std::vector<int> *in_type) const {
+    DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(LRNParam);
+
+MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp)
+.add_argument("data", "NDArray-or-Symbol", "Input data.")
+.add_arguments(LRNParam::__FIELDS__())
+.describe(R"code(Applies local response normalization to the input.
+
+The local response normalization layer performs "lateral inhibition" by normalizing
+over local input regions.
+
+If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
+:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
+activity :math:`b_{x,y}^{i}` is given by the expression:
+
+.. math::
+   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
+
+where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
+number of kernels in the layer.
+
+)code" ADD_FILELINE);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/nn/lrn.cu b/src/operator/lrn.cu
similarity index 64%
rename from src/operator/nn/lrn.cu
rename to src/operator/lrn.cu
index 4c31ca9602..ba872f1d26 100644
--- a/src/operator/nn/lrn.cu
+++ b/src/operator/lrn.cu
@@ -25,15 +25,29 @@
 */
 
 #include "./lrn-inl.h"
+#if MXNET_USE_CUDNN == 1
+#include "./cudnn_lrn-inl.h"
+#endif
 
 namespace mxnet {
 namespace op {
-
-NNVM_REGISTER_OP(LRN)
-.set_attr<FCompute>("FCompute<gpu>", LRNCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_LRN)
-.set_attr<FCompute>("FCompute<gpu>", LRNGradCompute<gpu>);
+template<>
+Operator* CreateOp<gpu>(LRNParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_CUDNN == 1
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNLocalResponseNormOp<DType>(param);
+  })
+#else
+#if CUDA_VERSION == 7000
+  LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled."
+             << "Please upgrade CUDA to 7.5+ or use CUDNN";
+#else
+  op = new LocalResponseNormOp<gpu>(param);
+#endif  // CUDA_VERSION
+#endif  // MXNET_USE_CUDNN
+  return op;
+}
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h
new file mode 100644
index 0000000000..b5967f4de2
--- /dev/null
+++ b/src/operator/mkl/mkl_batch_norm-inl.h
@@ -0,0 +1,391 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_batch_norm-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
+#include <mxnet/storage.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLBatchNormOp : public Operator {
+ public:
+  explicit MKLBatchNormOp(BatchNormParam param) {
+    this->param_ = param;
+    fwd_top_data = MKLData<DType>::create();
+    fwd_bottom_data = MKLData<DType>::create();
+    bwd_top_diff = MKLData<DType>::create();
+    bwd_bottom_diff = MKLData<DType>::create();
+    scaleShift_space.dptr = NULL;
+    scaleShiftDiff_space.dptr = NULL;
+  }
+  virtual ~MKLBatchNormOp() {
+    if (batchNormFwdInference != NULL) dnnDelete<DType>(batchNormFwdInference);
+    if (batchNormFwdTraining != NULL) dnnDelete<DType>(batchNormFwdTraining);
+    if (batchNormBwdScaleShift != NULL) dnnDelete<DType>(batchNormBwdScaleShift);
+    dnnLayoutDelete<DType>(layout_usr_);
+    if (scaleShift_space.dptr)
+      Storage::Get()->Free(scaleShift_space);
+    if (scaleShiftDiff_space.dptr)
+      Storage::Get()->Free(scaleShiftDiff_space);
+  }
+  static std::string getName() {
+    return "MKLBatchNormOp";
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out) {
+    eps_ = param_.eps;
+    size_t dim = 4, sizes[4], strides[4];
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+
+    sizes[0] = width_;
+    sizes[1] = height_;
+    sizes[2] = channels_;
+    sizes[3] = num_;
+
+    strides[0] = 1;
+    strides[1] = sizes[0];
+    strides[2] = sizes[0] * sizes[1];
+    strides[3] = sizes[0] * sizes[1] * sizes[2];
+
+    // Names are for debugging only
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
+    fwd_top_data->name = "fwd_top_data      @ " + getName();
+    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
+    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
+
+    dnnError_t e;
+    e = dnnLayoutCreate<DType>(&layout_usr_, dim, sizes, strides);
+    CHECK_EQ(e, E_SUCCESS);
+
+    fwd_bottom_data->create_user_layout(dim, sizes, strides);
+    fwd_top_data->create_user_layout(dim, sizes, strides);
+    bwd_bottom_diff->create_user_layout(dim, sizes, strides);
+    bwd_top_diff->create_user_layout(dim, sizes, strides);
+
+    // Primitives will be allocated during the first fwd pass
+    batchNormFwdInference = NULL;
+    batchNormFwdTraining = NULL;
+    batchNormBwdScaleShift = NULL;
+    int scaleShift_size = channels_*2*sizeof(DType);
+    scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
+    scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
+    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
+    /*!use_weight_bias_*/
+    for (int i = 0; i < channels_; i++) {
+        scaleShift_buf[i] = 1.0;
+        scaleShift_buf[channels_ + i] = 0;
+    }
+  }
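
LayerSetUp above describes the NCHW input to MKL2017 with dims listed innermost-first (w, h, c, n) and strides growing accordingly, and packs gamma/beta into a single scaleShift buffer of 2*channels entries (scales first, initialized to 1, then shifts at 0). A standalone sketch of those two conventions:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct MklUserLayout {
      size_t sizes[4];    // {w, h, c, n}, innermost dimension first
      size_t strides[4];  // {1, w, w*h, w*h*c}
    };

    MklUserLayout nchw_layout(size_t n, size_t c, size_t h, size_t w) {
      MklUserLayout l{{w, h, c, n}, {1, w, w * h, w * h * c}};
      return l;
    }

    // [scale_0 ... scale_{C-1}, shift_0 ... shift_{C-1}], as filled in above.
    std::vector<float> init_scale_shift(size_t channels) {
      std::vector<float> buf(2 * channels, 0.f);
      std::fill(buf.begin(), buf.begin() + channels, 1.f);
      return buf;
    }
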
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 3);
+    CHECK_EQ(aux_states.size(), 2);
+    if (ctx.is_train) {
+      CHECK_EQ(out_data.size(), 3);
+      CHECK_EQ(req.size(), 3);
+    } else {
+      CHECK_GE(out_data.size(), 1);
+      CHECK_GE(req.size(), 1);
+      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
+    }
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType>  data;
+    Tensor<xpu, 4, DType>  out;
+    if (in_data[batchnorm::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0],
+                               in_data[batchnorm::kData].shape_[1], 1, 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_data[batchnorm::kData], dshape, s);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[batchnorm::kOut], dshape, s);
+    } else {
+      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
+      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[batchnorm::kOut], s);
+    }
+
+    // const real_t scale = static_cast<real_t>(in_data[batchnorm::kData].shape_[1]) /
+    //   static_cast<real_t>(in_data[batchnorm::kData].shape_.Size());
+
+    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> bias = in_data[batchnorm::kBeta].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
+
+    if (param_.fix_gamma)
+      slope = 1.f;
+
+    dnnError_t e;
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data =
+          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
+#endif
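+    // With use_global_stats, the backward primitive is told that mean/variance are
+    // supplied as inputs (dnnUseInputMeanVariance) rather than computed from the batch.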
+    int bwd_flags = dnnUseScaleShift;
+    if (param_.use_global_stats)
+      bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance;
+#if MKL_EXPERIMENTAL == 1
+    if (NULL != bottom_data) {
+      // Is it the first pass? Create a primitive.
+      if (batchNormFwdInference == NULL) {
+        std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_;
+        std::shared_ptr<PrvMemDescr> bottom_prv_desc = bottom_data_mem->get_prv_descriptor();
+        CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017);
+        std::shared_ptr<MKLData<DType> > mem_descr
+          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_desc);
+        CHECK(mem_descr != NULL);
+        fwd_bottom_data = mem_descr;
+
+        e = dnnBatchNormalizationCreateForward_v2<DType>(
+             &batchNormFwdInference, NULL, mem_descr->layout_int, eps_,
+             dnnUseInputMeanVariance | dnnUseScaleShift);
+        CHECK_EQ(e, E_SUCCESS);
+
+        e = dnnBatchNormalizationCreateForward_v2<DType>(
+              &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_,
+              dnnUseScaleShift);
+        CHECK_EQ(e, E_SUCCESS);
+
+        fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst);
+        bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst);
+        bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc);
+
+        e = dnnBatchNormalizationCreateBackward_v2<DType>(
+                &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags);
+        CHECK_EQ(e, E_SUCCESS);
+      }
+    }
+#endif
+    if (NULL == bottom_data) {
+      if (batchNormFwdInference == NULL) {
+        e = dnnBatchNormalizationCreateForward_v2<DType>(
+          &batchNormFwdInference, NULL, layout_usr_, eps_,
+          dnnUseInputMeanVariance | dnnUseScaleShift);
+        CHECK_EQ(e, E_SUCCESS);
+
+        e = dnnBatchNormalizationCreateForward_v2<DType>(
+              &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift);
+        CHECK_EQ(e, E_SUCCESS);
+
+        e = dnnBatchNormalizationCreateBackward_v2<DType>(
+              &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags);
+        CHECK_EQ(e, E_SUCCESS);
+      }
+      bottom_data = reinterpret_cast<void *>(data.dptr_);
+    }
+
+    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
+    // use_weight_bias_: copy gamma (scale) and beta (shift) into the packed scaleShift buffer.
+    for (int i = 0; i < channels_; i++) {
+        scaleShift_buf[i] = (slope.dptr_)[i];
+    }
+    for (int i = 0; i < channels_; i++) {
+      scaleShift_buf[channels_ + i] = (bias.dptr_)[i];
+    }
+
+    void* BatchNorm_res[dnnResourceNumber];
+    BatchNorm_res[dnnResourceSrc] = bottom_data;
+    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
+
+    BatchNorm_res[dnnResourceDst] = fwd_top_data->get_output_ptr(out.dptr_,
+      fwd_top_data, out_data[batchnorm::kOut]);
+    if (ctx.is_train && !param_.use_global_stats) {
+      Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
+      Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
+      CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo);
+      CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo);
+      BatchNorm_res[dnnResourceMean] = mean.dptr_;
+      BatchNorm_res[dnnResourceVariance] = var.dptr_;
+      e = dnnExecute<DType>(batchNormFwdTraining, BatchNorm_res);
+      CHECK_EQ(e, E_SUCCESS);
+    } else {
+      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
+      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
+      e = dnnExecute<DType>(batchNormFwdInference, BatchNorm_res);
+      CHECK_EQ(e, E_SUCCESS);
+    }
+
+#if MKL_EXPERIMENTAL == 0
+    if (fwd_top_data->conversion_needed()) {
+      fwd_top_data->convert_from_prv(out.dptr_);
+    }
+#endif
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 3);
+    CHECK_EQ(out_data.size(), 3);
+    CHECK_EQ(in_grad.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data, grad, grad_in;
+
+    if (in_data[batchnorm::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0],
+                               out_grad[batchnorm::kOut].shape_[1], 1, 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_data[batchnorm::kData], dshape, s);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[batchnorm::kOut], dshape, s);
+      grad_in = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_grad[batchnorm::kData], dshape, s);
+    } else {
+      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
+      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[batchnorm::kOut], s);
+      grad_in = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[batchnorm::kData], s);
+    }
+
+    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> gslope = in_grad[batchnorm::kGamma].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> gbias = in_grad[batchnorm::kBeta].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
+
+    if (param_.fix_gamma)  slope = 1.f;
+
+    void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
+#endif
+    if (NULL == bottom_data)
+      bottom_data = reinterpret_cast<void *>(data.dptr_);
+
+    dnnError_t e;
+    void* BatchNorm_res[dnnResourceNumber];
+    BatchNorm_res[dnnResourceSrc] = bottom_data;
+    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
+    if (ctx.is_train && !param_.use_global_stats) {
+      int size = mean.size(0);  // Tensor<xpu, 1, DType>
+      float * moving_mean_ptr = reinterpret_cast<float*>(moving_mean.dptr_);
+      float * mean_ptr = reinterpret_cast<float*>(mean.dptr_);
+      float * moving_var_ptr = reinterpret_cast<float*>(moving_var.dptr_);
+      float * var_ptr = reinterpret_cast<float*>(var.dptr_);
+      float minus_mom = (1 - param_.momentum);
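+      // Exponential moving average: moving = momentum * moving + (1 - momentum) * batch statistic.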
+      for (int i = 0; i < size; i++) {
+        moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum
+          + mean_ptr[i] * minus_mom;
+      }
+      for (int i = 0; i < size; i++) {
+        moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum
+          + var_ptr[i] * minus_mom;
+      }
+      BatchNorm_res[dnnResourceMean] = mean.dptr_;
+      BatchNorm_res[dnnResourceVariance] = var.dptr_;
+    } else {
+      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
+      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
+    }
+
+
+    BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_,
+      bwd_bottom_diff, in_grad[batchnorm::kData]);
+    BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_,
+             true, out_grad[batchnorm::kOut]);
+    BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr;
+    e = dnnExecute<DType>(batchNormBwdScaleShift, BatchNorm_res);
+    CHECK_EQ(e, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+    if (bwd_bottom_diff->conversion_needed()) {
+      bwd_bottom_diff->convert_from_prv(grad_in.dptr_);
+    }
+#endif
+    DType * scaleShiftDiff_buf = reinterpret_cast<DType*>(scaleShiftDiff_space.dptr);
+    if (!param_.fix_gamma) {
+      // Store ScaleShift blobs
+      DType* diff_scale = gslope.dptr_;
+      for (int i = 0; i < channels_; i++) {
+        diff_scale[i] = scaleShiftDiff_buf[i];
+      }
+    } else {
+      int gslope_size = gslope.size(0);
+      float * gslope_ptr = reinterpret_cast<float*>(gslope.dptr_);
+      for (int i = 0; i < gslope_size; i++) {
+        *gslope_ptr++ = 0.0f;
+      }
+    }
+    DType* diff_shift = gbias.dptr_;
+    for (int i = 0; i < channels_; i++) {
+      diff_shift[i] = scaleShiftDiff_buf[channels_ + i];
+    }
+  }
+
+ private:
+  BatchNormParam param_;
+  DType eps_;
+  bool use_weight_bias_;
+
+  int num_;
+  int channels_;
+  int height_;
+  int width_;
+  bool init_mkldnn_ = false;
+  std::shared_ptr<MKLData<DType> > fwd_top_data;
+  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
+  std::shared_ptr<MKLData<DType> > bwd_top_diff;
+  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
+  dnnPrimitive_t batchNormFwdInference = NULL;
+  dnnPrimitive_t batchNormFwdTraining = NULL;
+  dnnPrimitive_t batchNormBwdScaleShift = NULL;
+  Storage::Handle scaleShift_space;
+  Storage::Handle scaleShiftDiff_space;
+  dnnLayout_t layout_usr_ = NULL;
+};  // class BatchNormOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h
new file mode 100644
index 0000000000..1ed1e81d13
--- /dev/null
+++ b/src/operator/mkl/mkl_concat-inl.h
@@ -0,0 +1,314 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_concat-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../channel_op_common.h"
+#include "./mkl_util-inl.h"
+namespace mxnet {
+namespace op {
+
+
+template<typename xpu, typename DType>
+class MKLConcatOp : public Operator {
+ public:
+  static std::string getName() {
+    return "MKLConcatOp";
+  }
+  explicit MKLConcatOp(ConcatParam param)
+    : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) {
+    concatFwd_ = static_cast<dnnPrimitive_t>(NULL);
+    concatBwd_ = static_cast<dnnPrimitive_t>(NULL);
+    fwd_top_data_ = MKLData<DType>::create();
+    bwd_top_diff_ = MKLData<DType>::create();
+
+    num_concats_ = param.num_args;
+  }
+  virtual ~MKLConcatOp() {
+    dnnDelete<DType>(concatFwd_);
+    dnnDelete<DType>(concatBwd_);
+  }
+
+ private:
+  void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 4, DType> > &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out,
+                  size_t data_shape_size, size_t *split_channels_) {
+    size_t dim_src = data_shape_size;
+    size_t dim_dst = dim_src;
+    num_concats_ = size_;
+    channels_ = 0;
+
+    for (size_t i = 1; i < num_concats_; ++i) {
+      for (size_t j = 1; j < data_shape_size; ++j) {
+        if (j == dimension_) continue;
+        CHECK_EQ(data[0].shape_[j], data[i].shape_[j]);
+      }
+    }
+
+    for (size_t i = 0; i < num_concats_; ++i) {
+      CHECK_EQ((int)dim_src, data[i].shape_.kDimension);
+
+      fwd_bottom_data_.push_back(MKLData<DType>::create());
+      bwd_bottom_diff_.push_back(MKLData<DType>::create());
+      fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]";
+      bwd_bottom_diff_[i]->name = "bwd_bottom_diff_[i]";
+
+      size_t *sizes_src = new size_t[dim_src];
+      size_t *strides_src = new size_t[dim_src];
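+      // Reverse the shape so the fastest-varying dimension comes first, as the MKL user layout expects.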
+      for (size_t d = 0; d < dim_src; ++d) {
+        sizes_src[d] = data[i].shape_[dim_src - d - 1];
+        strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
+      }
+
+      split_channels_[i] = data[i].shape_[1];
+      channels_ += split_channels_[i];
+      fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src);
+      bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src);
+      delete[] sizes_src;
+      delete[] strides_src;
+    }
+    size_t *sizes_dst = new size_t[dim_dst];
+    size_t *strides_dst = new size_t[dim_dst];
+    for (size_t d = 0; d < dim_dst; ++d) {
+      if (d == 2)
+        sizes_dst[d] = channels_;
+      else
+        sizes_dst[d] = data[0].shape_[dim_dst - 1 - d];
+      strides_dst[d] = (d == 0) ? 1 : strides_dst[d - 1] * sizes_dst[d - 1];
+    }
+    bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst);
+    fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst);
+    delete[] sizes_dst;
+    delete[] strides_dst;
+    concatFwd_ = NULL;
+    concatBwd_ = NULL;
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 4, DType> > data(size_);
+    Tensor<xpu, 4, DType> out;
+    if (in_data[0].ndim() == 2) {
+      for (int i = 0; i < size_; ++i) {
+        Shape<4> dshape = Shape4(in_data[i].shape_[0],
+                                 in_data[i].shape_[1], 1, 1);
+        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+          in_data[i], dshape, s);
+      }
+      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
+                               out_data[concat_enum::kOut].shape_[1], 1, 1);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[concat_enum::kOut], dshape, s);
+    } else if (in_data[0].ndim() == 3) {
+      for (int i = 0; i < size_; ++i) {
+        Shape<4> dshape = Shape4(in_data[i].shape_[0],
+          in_data[i].shape_[1], in_data[i].shape_[2], 1);
+        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+          in_data[i], dshape, s);
+      }
+      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
+        out_data[concat_enum::kOut].shape_[1],
+        out_data[concat_enum::kOut].shape_[2], 1);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[concat_enum::kOut], dshape, s);
+    } else {
+      for (int i = 0; i < size_; ++i) {
+        data[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_data[i], s);
+      }
+      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[concat_enum::kOut], s);
+    }
+    size_t *split_channels_ = new size_t[num_concats_];
+    if (!init_mkldnn_) {
+      init_mkldnn_ = true;
+      LayerSetUp(data, out, 4, split_channels_);
+    }
+
+    dnnError_t e;
+    std::vector<void*> bottom_data;
+    bool isFirstPass = (concatFwd_ == NULL);
+    dnnLayout_t *layouts = NULL;
+    if (isFirstPass) {
+      layouts = new dnnLayout_t[num_concats_];
+    }
+
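+    // Gather each input's layout: the private MKL layout when the input already carries one,
+    // otherwise the plain user layout; on the first pass these feed dnnConcatCreate.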
+    for (size_t i = 0; i < num_concats_; i++) {
+      void * bottom_i = NULL;
+#if MKL_EXPERIMENTAL == 1
+      bottom_i = mkl_prv_data<DType>(in_data[i]);
+      if (bottom_i != NULL) {
+        if (isFirstPass) {
+          std::shared_ptr<MKLData<DType> > mem_descr =
+            mkl_get_mem_desc<DType>(in_data[i].Mkl_mem_);
+          fwd_bottom_data_[i] = mem_descr;
+          layouts[i] = mem_descr->layout_int;
+        }
+      }
+#endif
+      if (bottom_i == NULL) {
+        bottom_i = data[i].dptr_;
+        if (isFirstPass) {
+          layouts[i] = fwd_bottom_data_[i]->layout_usr;
+        }
+      }
+
+      bottom_data.push_back(reinterpret_cast<void *>(bottom_i));
+    }
+
+    if (isFirstPass) {
+      e = dnnConcatCreate<DType>(&concatFwd_, NULL, num_concats_, layouts);
+      CHECK_EQ(e, E_SUCCESS);
+
+      fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst);
+      bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst);
+
+      e = dnnSplitCreate<DType>(&concatBwd_, NULL, num_concats_,
+            bwd_top_diff_->layout_int, split_channels_);
+      CHECK_EQ(e, E_SUCCESS);
+
+      for (size_t n = 0; n < num_concats_; ++n) {
+        fwd_bottom_data_[n]->create_internal_layout(concatFwd_,
+          (dnnResourceType_t)(dnnResourceMultipleSrc + n));
+        bwd_bottom_diff_[n]->create_internal_layout(concatBwd_,
+          (dnnResourceType_t)(dnnResourceMultipleDst + n));
+      }
+    }
+    delete[] layouts;
+
+    void *concat_res[dnnResourceNumber];
+    for (size_t i = 0; i < num_concats_; ++i) {
+      concat_res[dnnResourceMultipleSrc + i]
+        = reinterpret_cast<void*>(bottom_data[i]);
+    }
+
+    concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_,
+      fwd_top_data_, out_data[concat_enum::kOut]);
+    e = dnnExecute<DType>(concatFwd_, concat_res);
+    CHECK_EQ(e, E_SUCCESS);
+    delete[] split_channels_;
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 4, DType> > grad_in(size_);
+    Tensor<xpu, 4, DType> grad;
+    if (in_grad[0].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
+        out_grad[concat_enum::kOut].shape_[1], 1, 1);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[concat_enum::kOut], dshape, s);
+      for (int i = 0; i < size_; ++i) {
+        dshape = Shape4(in_grad[i].shape_[0],
+          in_grad[i].shape_[1], 1, 1);
+        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+          in_grad[i], dshape, s);
+      }
+    } else if (in_grad[0].ndim() == 3) {
+      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
+        out_grad[concat_enum::kOut].shape_[1],
+        out_grad[concat_enum::kOut].shape_[2], 1);
+      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[concat_enum::kOut], dshape, s);
+      for (int i = 0; i < size_; ++i) {
+        dshape = Shape4(in_grad[i].shape_[0],
+          in_grad[i].shape_[1], in_grad[i].shape_[2], 1);
+        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+          in_grad[i], dshape, s);
+      }
+    } else {
+      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[concat_enum::kOut], s);
+      for (int i = 0; i < size_; ++i) {
+        grad_in[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[i], s);
+      }
+    }
+
+    int need_bwd = 0;
+    for (size_t n = 0; n < num_concats_; n++) {
+      need_bwd += req[n];
+    }
+    if (!need_bwd) {
+      return;
+    }
+
+    dnnError_t e;
+    void *concat_res[dnnResourceNumber];
+    concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true,
+      out_grad[concat_enum::kOut]);
+    for (size_t i = 0; i < num_concats_; ++i) {
+      concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr(
+        grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]);
+    }
+    e = dnnExecute<DType>(concatBwd_, concat_res);
+    CHECK_EQ(e, E_SUCCESS);
+  }
+
+ private:
+  int size_;
+  size_t dimension_;
+
+  bool init_mkldnn_;
+
+  dnnPrimitive_t concatFwd_;
+  dnnPrimitive_t concatBwd_;
+  std::shared_ptr<MKLData<DType> > fwd_top_data_;
+  std::vector< std::shared_ptr<MKLData<DType> > > fwd_bottom_data_;
+  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
+  std::vector< std::shared_ptr<MKLData<DType> > > bwd_bottom_diff_;
+
+
+  size_t width_;
+  size_t height_;
+  size_t channels_;
+  size_t num_;
+  size_t num_concats_;
+};  // class MKLConcatOp
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h
new file mode 100644
index 0000000000..813d061f17
--- /dev/null
+++ b/src/operator/mkl/mkl_convolution-inl.h
@@ -0,0 +1,490 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_convolution-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
+#include <mxnet/storage.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../nn/convolution-inl.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLConvolutionOp : public Operator {
+ public:
+  static std::string getName() {
+    return "MKLConvolutionOp";
+  }
+  void SetupBuffer() {
+    convolutionBwdBias = static_cast<dnnPrimitive_t>(NULL);
+    convolutionBwdFilter = static_cast<dnnPrimitive_t>(NULL);
+    convolutionBwdData = static_cast<dnnPrimitive_t>(NULL);
+    convolutionFwd = static_cast<dnnPrimitive_t>(NULL);
+    fwd_bottom_data = MKLData<DType>::create();
+    fwd_top_data = MKLData<DType>::create();
+    fwd_filter_data = MKLData<DType>::create();
+    fwd_bias_data = MKLData<DType>::create();
+    bwdd_top_diff = MKLData<DType>::create();
+    bwdd_bottom_diff = MKLData<DType>::create();
+    bwdd_filter_data = MKLData<DType>::create();
+    bwdf_top_diff = MKLData<DType>::create();
+    bwdf_filter_diff = MKLData<DType>::create();
+    bwdf_bottom_data = MKLData<DType>::create();
+    bwdb_top_diff = MKLData<DType>::create();
+    bwdb_bias_diff = MKLData<DType>::create();
+    // Names are for debugging purposes only.
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
+    fwd_top_data->name = "fwd_top_data      @ " + this->getName();
+    fwd_filter_data->name = "fwd_filter_data   @ " + this->getName();
+    fwd_bias_data->name = "fwd_bias_data     @ " + this->getName();
+    bwdd_top_diff->name = "bwdd_top_diff     @ " + this->getName();
+    bwdd_bottom_diff->name = "bwdd_bottom_diff  @ " + this->getName();
+    bwdd_filter_data->name = "bwdd_filter_data  @ " + this->getName();
+    bwdf_top_diff->name = "bwdf_top_diff     @ " + this->getName();
+    bwdf_bottom_data->name = "bwdf_bottom_data  @ " + this->getName();
+    bwdf_filter_diff->name = "bwdf_filter_diff  @ " + this->getName();
+    bwdb_top_diff->name = "bwdb_top_diff     @ " + this->getName();
+    bwdb_bias_diff->name = "bwdb_bias_diff    @ " + this->getName();
+  }
+
+  explicit MKLConvolutionOp(ConvolutionParam p):
+                            convolutionFwd(NULL),
+                            convolutionBwdData(static_cast<dnnPrimitive_t>(NULL)),
+                            convolutionBwdFilter(static_cast<dnnPrimitive_t>(NULL)),
+                            convolutionBwdBias(static_cast<dnnPrimitive_t>(NULL)) {
+    this->param_ = p;
+    init_mkldnn_ = false;
+    // convert MBytes first to Bytes and then to elements.
+    param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    SetupBuffer();
+  }
+  void ReleaseBuffer() {
+    if (convolutionFwd != NULL) {
+     dnnDelete<DType>(convolutionFwd);
+     convolutionFwd = NULL;
+    }
+    if (convolutionBwdData != NULL) {
+     dnnDelete<DType>(convolutionBwdData);
+     convolutionBwdData = NULL;
+    }
+    if (convolutionBwdFilter != NULL) {
+     dnnDelete<DType>(convolutionBwdFilter);
+     convolutionBwdFilter = NULL;
+    }
+    if (!param_.no_bias && convolutionBwdBias != NULL) {
+     dnnDelete<DType>(convolutionBwdBias);
+     convolutionBwdBias = NULL;
+    }
+  }
+  virtual ~MKLConvolutionOp() {
+    ReleaseBuffer();
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out) {
+    this->width_ = data.shape_[3];
+    this->height_ = data.shape_[2];
+    this->channels_ = data.shape_[1];
+    this->num_ = data.shape_[0];
+    this->group_ = param_.num_group;
+    this->width_out_ = out.shape_[3];
+    this->height_out_ = out.shape_[2];
+    int channel_out_ = out.shape_[1];
+    this->num_output_ = channel_out_;
+    kernel_w_ = param_.kernel[1];
+    kernel_h_ = param_.kernel[0];
+    stride_w_ = param_.stride[1];
+    stride_h_ = param_.stride[0];
+    pad_w_ = param_.pad[1];
+    pad_h_ = param_.pad[0];
+    int status;
+    size_t n, g;
+    size_t iw, ih, ic;
+    size_t ow, oh, oc;
+    size_t kw, kh;
+    size_t dimension = 4;
+    g = std::max(this->group_, 1);
+    n = this->num_;
+    iw = this->width_;
+    ih = this->height_;
+    ic = this->channels_;
+    ow = this->width_out_;
+    oh = this->height_out_;
+    oc = this->num_output_;
+    kw = this->kernel_w_;
+    kh = this->kernel_h_;
+    size_t bdata_sizes[4] = { iw, ih, ic, n };
+    size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic };
+    /* starting with MKL 2017 Gold in case of groups filter layout
+    * becomes 5D, i.e. groups become a separate dimension */
+    size_t g_mkl2017 = g;
+    size_t f_dimension = dimension + (g != 1);
+    if (getMKLBuildDate() < 20160701) {
+     g_mkl2017 = 1;
+     f_dimension = dimension;
+    }
+    size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 };
+    size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g };
+    size_t bias_sizes[1] = { oc };
+    size_t bias_strides[1] = { 1 };
+    size_t tdata_sizes[4] = { ow, oh, oc, n };
+    size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc };
+    size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ };
+    int    inputOffset[2] = { -this->pad_w_, -this->pad_h_ };
+    // Names are for debugging purposes only.
+    /*** convolution section ***/
+    if (!param_.no_bias) {
+      status = dnnGroupsConvolutionCreateForwardBias<DType>(&convolutionFwd,
+                                                            NULL,
+                                                            dnnAlgorithmConvolutionDirect,
+                                                            g,
+                                                            dimension,
+                                                            bdata_sizes,
+                                                            tdata_sizes,
+                                                            fdata_sizes,
+                                                            convolutionStrides,
+                                                            inputOffset,
+                                                            dnnBorderZeros);
+    } else {
+      status = dnnGroupsConvolutionCreateForward<DType>(&convolutionFwd,
+                                                        NULL,
+                                                        dnnAlgorithmConvolutionDirect,
+                                                        g,
+                                                        dimension,
+                                                        bdata_sizes,
+                                                        tdata_sizes,
+                                                        fdata_sizes,
+                                                        convolutionStrides,
+                                                        inputOffset,
+                                                        dnnBorderZeros);
+    }
+    CHECK_EQ(status, 0)
+     << "Failed dnnCreateConvolution<DType>(dnnForward) with status "
+     << status << "\n";
+    fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension,
+                                    bdata_sizes, bdata_strides);
+    fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension,
+                                 tdata_sizes, tdata_strides);
+    fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter,
+                                    f_dimension, fdata_sizes, fdata_strides);
+    if (!param_.no_bias)
+      fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1,
+                                    bias_sizes, bias_strides);
+    /*
+    * Backward by data layer setup
+    */
+    status = dnnGroupsConvolutionCreateBackwardData<DType>(&convolutionBwdData,
+                                                           NULL,
+                                                           dnnAlgorithmConvolutionDirect,
+                                                           g,
+                                                           dimension,
+                                                           bdata_sizes,
+                                                           tdata_sizes,
+                                                           fdata_sizes,
+                                                           convolutionStrides,
+                                                           inputOffset,
+                                                           dnnBorderZeros);
+    CHECK_EQ(status, 0)
+     << "Failed dnnConvolutionCreateBackwardData with status "
+     << status << "\n";
+    bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc,
+                                     dimension, bdata_sizes, bdata_strides);
+    bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst,
+                                  dimension, tdata_sizes, tdata_strides);
+    bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter,
+                                     f_dimension, fdata_sizes, fdata_strides);
+    /*
+    * Backward by filter layer setup
+    */
+    status = dnnGroupsConvolutionCreateBackwardFilter<DType>(&convolutionBwdFilter,
+                                                             NULL,
+                                                             dnnAlgorithmConvolutionDirect,
+                                                             g,
+                                                             dimension,
+                                                             bdata_sizes,
+                                                             tdata_sizes,
+                                                             fdata_sizes,
+                                                             convolutionStrides,
+                                                             inputOffset,
+                                                             dnnBorderZeros);
+    CHECK_EQ(status, 0)
+     << "Failed dnnConvolutionCreateBackwardFilter with status "
+     << status << "\n";
+    bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc,
+                                     dimension, bdata_sizes, bdata_strides);
+    bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst,
+                                  dimension, tdata_sizes, tdata_strides);
+    bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter,
+                                     f_dimension, fdata_sizes, fdata_strides);
+    /*
+    * Backward by bias layer setup
+    */
+    if (!param_.no_bias) {
+      status = dnnGroupsConvolutionCreateBackwardBias<DType>(&convolutionBwdBias,
+                                                             NULL,
+                                                             dnnAlgorithmConvolutionDirect,
+                                                             g,
+                                                             dimension,
+                                                             tdata_sizes);
+     CHECK_EQ(status, 0)
+      << "Failed dnnConvolutionCreateBackwardBias with status "
+      << status << "\n";
+     bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst,
+                                   dimension, tdata_sizes, tdata_strides);
+     bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1,
+                                    bias_sizes, bias_strides);
+    }
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    DType *data_ptr = NULL;
+    DType *wmat_ptr = NULL;
+    DType *out_ptr = NULL;
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
+    Tensor<xpu, 4, DType> out =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_data[conv::kOut], s);
+    Tensor<xpu, 4, DType> wmat =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kWeight], s);
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    CHECK_EQ(data.CheckContiguous(), true);
+    CHECK_EQ(wmat.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    data_ptr = data.dptr_;
+    wmat_ptr = wmat.dptr_;
+    out_ptr = out.dptr_;
+    int status;
+    void *res_convolutionFwd[dnnResourceNumber];
+    res_convolutionFwd[dnnResourceSrc] =
+      fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
+    res_convolutionFwd[dnnResourceFilter] =
+      fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> bias =
+        mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
+      res_convolutionFwd[dnnResourceBias] =
+        fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
+    }
+
+    res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr,
+      fwd_top_data, out_data[conv::kOut]);
+    status = dnnExecute<DType>(convolutionFwd, res_convolutionFwd);
+    CHECK_EQ(status, 0) << "Forward convolution failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+    if (fwd_top_data->conversion_needed()) {
+        fwd_top_data->convert_from_prv(out_ptr);
+    }
+#endif
+  }
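+  // Helpers for kAddTo: the MKL primitives overwrite their output buffer, so the existing
+  // gradient is saved first and added back after execution.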
+  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
+    int blob_byte_size = blob_size * sizeof(DType);
+    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
+    memcpy(pws->dptr, src, blob_byte_size);
+  }
+  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
+    DType *dst = reinterpret_cast<DType*>(dst_);
+    DType *src = reinterpret_cast<DType*>(pws->dptr);
+#pragma omp parallel for
+    for (int i = 0; i < blob_size; i++) {
+      dst[i] += src[i];
+    }
+    if (pws->dptr)
+      Storage::Get()->Free(*pws);
+    pws->dptr = NULL;
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
+    }
+    CHECK_EQ(out_grad.size(), 1);
+    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
+    Shape<3> wmat_shape =
+      Shape3(param_.num_group,
+             param_.num_filter / param_.num_group,
+             data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+    Tensor<xpu, 3, DType> wmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+      in_data[conv::kWeight], wmat_shape, s);
+    Tensor<xpu, 4, DType> grad =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[conv::kOut], s);
+    Tensor<xpu, 4, DType> gdata =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[conv::kData], s);
+    Tensor<xpu, 3, DType> gwmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+      in_grad[conv::kWeight], wmat_shape, s);
+
+    if (!init_mkldnn_) {
+      init_mkldnn_ = true;
+      LayerSetUp(data, grad);
+    }
+    int status;
+    if (req[0]) {
+      void *res_convolutionBwdData[dnnResourceNumber];
+      res_convolutionBwdData[dnnResourceDiffDst] =
+        bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdData[dnnResourceFilter] =
+        bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]);
+     Storage::Handle addtoWorkspace;
+     if (req[0] == kAddTo) {
+       // MKL does not support kAddTo yet; save the current gradient so it can be added back afterwards.
+       AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace);
+     }
+
+     res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_,
+       bwdd_bottom_diff, in_grad[conv::kData]);
+     status = dnnExecute<DType>(convolutionBwdData, res_convolutionBwdData);
+     CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+     if (bwdd_bottom_diff->conversion_needed()) {
+       bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
+     }
+#endif
+     if (req[0] == kAddTo) {
+       if (bwdd_bottom_diff->conversion_needed()) {
+         bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
+       }
+      AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size());
+     }
+    }
+    if (req[1]) {
+      void *res_convolutionBwdFilter[dnnResourceNumber];
+
+      res_convolutionBwdFilter[dnnResourceDiffDst] =
+        bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdFilter[dnnResourceSrc] =
+        bwdf_bottom_data->get_converted_prv(data.dptr_, false,
+          in_data[conv::kData]);
+     Storage::Handle addtoWorkspace;
+     if (req[1] == kAddTo) {
+       // MKL does not support kAddTo yet; save the current gradient so it can be added back afterwards.
+       AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace);
+     }
+
+     res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr(
+       gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]);
+     status = dnnExecute<DType>(convolutionBwdFilter, res_convolutionBwdFilter);
+     CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+     if (bwdf_filter_diff->conversion_needed()) {
+       bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
+     }
+#endif
+     if (req[1] == kAddTo) {
+       if (bwdf_filter_diff->conversion_needed()) {
+         bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
+       }
+       AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size());
+     }
+    }
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> gbias =
+        mkl_experimental_direct_get<xpu, 1, DType>(in_grad[conv::kBias], s);
+      void *res_convolutionBwdBias[dnnResourceNumber];
+      res_convolutionBwdBias[dnnResourceDiffDst] =
+        bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdBias[dnnResourceDiffBias] = bwdb_bias_diff->get_output_ptr(gbias.dptr_,
+        bwdb_bias_diff, in_grad[conv::kBias]);
+      status = dnnExecute<DType>(convolutionBwdBias, res_convolutionBwdBias);
+      CHECK_EQ(status, 0) << "Backward Bias failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+      if (bwdb_bias_diff->conversion_needed()) {
+        bwdb_bias_diff->convert_from_prv(gbias.dptr_);
+      }
+#endif
+    }
+  }
+
+ private:
+  ConvolutionParam param_;
+  size_t width_,
+         height_,
+         width_out_,
+         height_out_,
+         kernel_w_,
+         kernel_h_,
+         stride_w_,
+         stride_h_;
+  int group_,
+      num_,
+      num_output_;
+  size_t channels_;
+  int pad_w_,
+      pad_h_;
+  bool init_mkldnn_;
+  dnnPrimitive_t convolutionFwd;
+  dnnPrimitive_t convolutionBwdData;
+  dnnPrimitive_t convolutionBwdFilter;
+  dnnPrimitive_t convolutionBwdBias;
+  /* Fwd step */
+  std::shared_ptr<MKLData<DType> > fwd_bottom_data, fwd_top_data, fwd_filter_data,
+                                   fwd_bias_data;
+  /* Bwd data step */
+  std::shared_ptr<MKLData<DType> > bwdd_top_diff, bwdd_bottom_diff;
+  std::shared_ptr<MKLData<DType> > bwdd_filter_data;
+  /* Bwd filter step */
+  std::shared_ptr<MKLData<DType> > bwdf_top_diff, bwdf_filter_diff;
+  std::shared_ptr<MKLData<DType> > bwdf_bottom_data;
+  std::shared_ptr<MKLData<DType> > bwdf_filter_diff_iter, bwdf2fwd_filter_diff,
+                                   bwdb_bias_diff_iter;
+  /* Bwd bias step */
+  std::shared_ptr<MKLData<DType> > bwdb_top_diff, bwdb_bias_diff;
+};  // class ConvolutionOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc
new file mode 100644
index 0000000000..507e5498c8
--- /dev/null
+++ b/src/operator/mkl/mkl_cppwrapper.cc
@@ -0,0 +1,44 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_cppwrapper.cc
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+
+
+
+#include "mkl_cppwrapper.h"
+#include <stdio.h>
+#if MXNET_USE_MKL2017 == 1
+#include "mkl_service.h"
+
+int getMKLBuildDate() {
+    static int build = 0;
+    if (build == 0) {
+        MKLVersion v;
+        mkl_get_version(&v);
+        build = atoi(v.Build);
+        printf("MKL Build:%d\n", build);
+    }
+    return build;
+}
+
+bool enableMKLWarnGenerated() {
+  return false;
+}
+#endif  // MXNET_USE_MKL2017
diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h
new file mode 100644
index 0000000000..7d66f20ad3
--- /dev/null
+++ b/src/operator/mkl/mkl_cppwrapper.h
@@ -0,0 +1,1020 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_cppwrapper.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
+#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
+
+
+#include <stdarg.h>
+#include <stddef.h>
+#if MXNET_USE_MKL2017 == 1
+#include "mkl_dnn_types.h"
+#include "mkl_dnn.h"
+#include "mkl_version.h"
+
+
+extern int getMKLBuildDate();
+extern bool enableMKLWarnGenerated();
+
+
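+// The templates below forward to the precision-specific MKL DNN C entry points
+// (_F32 for float, _F64 for double) based on the Dtype template argument.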
+template <typename Dtype> inline dnnError_t dnnLayoutCreate(
+    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]);
+template <> inline dnnError_t dnnLayoutCreate<float>(
+    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
+    return dnnLayoutCreate_F32(pLayout, dimension, size, strides);
+}
+template <> inline dnnError_t dnnLayoutCreate<double>(
+    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
+    return dnnLayoutCreate_F64(pLayout, dimension, size, strides);
+}
+
+template <typename Dtype> inline dnnError_t dnnLayoutCreateFromPrimitive(
+    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type);
+template <> inline dnnError_t dnnLayoutCreateFromPrimitive<float>(
+    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
+    return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type);
+}
+template <> inline dnnError_t dnnLayoutCreateFromPrimitive<double>(
+    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
+    return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type);
+}
+
+template <typename Dtype> inline size_t dnnLayoutGetMemorySize(
+    const dnnLayout_t layout);
+template <> inline size_t dnnLayoutGetMemorySize<float>(
+    const dnnLayout_t layout) {
+    return dnnLayoutGetMemorySize_F32(layout);
+}
+template <> inline size_t dnnLayoutGetMemorySize<double>(
+    const dnnLayout_t layout) {
+    return dnnLayoutGetMemorySize_F64(layout);
+}
+
+template <typename Dtype> inline int dnnLayoutCompare(
+    const dnnLayout_t l1, const dnnLayout_t l2);
+template <> inline int dnnLayoutCompare<float>(
+    const dnnLayout_t l1, const dnnLayout_t l2) {
+    return dnnLayoutCompare_F32(l1, l2);
+}
+template <> inline int dnnLayoutCompare<double>(
+    const dnnLayout_t l1, const dnnLayout_t l2) {
+    return dnnLayoutCompare_F64(l1, l2);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnAllocateBuffer(
+    void **pPtr, dnnLayout_t layout);
+template <> inline dnnError_t dnnAllocateBuffer<float>(
+    void **pPtr, dnnLayout_t layout) {
+    return dnnAllocateBuffer_F32(pPtr, layout);
+}
+template <> inline dnnError_t dnnAllocateBuffer<double>(
+    void **pPtr, dnnLayout_t layout) {
+    return dnnAllocateBuffer_F64(pPtr, layout);
+}
+
+template <typename Dtype> inline dnnError_t dnnReleaseBuffer(
+    void *ptr);
+template <> inline dnnError_t dnnReleaseBuffer<float>(
+    void *ptr) {
+    return dnnReleaseBuffer_F32(ptr);
+}
+template <> inline dnnError_t dnnReleaseBuffer<double>(
+    void *ptr) {
+    return dnnReleaseBuffer_F64(ptr);
+}
+
+template <typename Dtype> inline dnnError_t dnnLayoutDelete(
+    dnnLayout_t layout);
+template <> inline dnnError_t dnnLayoutDelete<float>(
+    dnnLayout_t layout) {
+    return dnnLayoutDelete_F32(layout);
+}
+template <> inline dnnError_t dnnLayoutDelete<double>(
+    dnnLayout_t layout) {
+    return dnnLayoutDelete_F64(layout);
+}
+
+template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesCreate(
+    dnnPrimitiveAttributes_t *attributes);
+template <> inline dnnError_t dnnPrimitiveAttributesCreate<float>(
+    dnnPrimitiveAttributes_t *attributes) {
+    return dnnPrimitiveAttributesCreate_F32(attributes);
+}
+template <> inline dnnError_t dnnPrimitiveAttributesCreate<double>(
+    dnnPrimitiveAttributes_t *attributes) {
+    return dnnPrimitiveAttributesCreate_F64(attributes);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesDestroy(
+    dnnPrimitiveAttributes_t attributes);
+template <> inline dnnError_t dnnPrimitiveAttributesDestroy<float>(
+    dnnPrimitiveAttributes_t attributes) {
+    return dnnPrimitiveAttributesDestroy_F32(attributes);
+}
+template <> inline dnnError_t dnnPrimitiveAttributesDestroy<double>(
+    dnnPrimitiveAttributes_t attributes) {
+    return dnnPrimitiveAttributesDestroy_F64(attributes);
+}
+
+template <typename Dtype> inline dnnError_t dnnPrimitiveGetAttributes(
+    dnnPrimitive_t primitive,
+    dnnPrimitiveAttributes_t *attributes);
+template <> inline dnnError_t dnnPrimitiveGetAttributes<float>(
+    dnnPrimitive_t primitive,
+    dnnPrimitiveAttributes_t *attributes) {
+    return dnnPrimitiveGetAttributes_F32(primitive, attributes);
+}
+template <> inline dnnError_t dnnPrimitiveGetAttributes<double>(
+    dnnPrimitive_t primitive,
+    dnnPrimitiveAttributes_t *attributes) {
+    return dnnPrimitiveGetAttributes_F64(primitive, attributes);
+}
+
+template <typename Dtype> inline dnnError_t dnnExecute(
+    dnnPrimitive_t primitive, void *resources[]);
+template <> inline dnnError_t dnnExecute<float>(
+    dnnPrimitive_t primitive, void *resources[]) {
+    return dnnExecute_F32(primitive, resources);
+}
+template <> inline dnnError_t dnnExecute<double>(
+    dnnPrimitive_t primitive, void *resources[]) {
+    return dnnExecute_F64(primitive, resources);
+}
+
+template <typename Dtype> inline dnnError_t dnnExecuteAsync(
+    dnnPrimitive_t primitive, void *resources[]);
+template <> inline dnnError_t dnnExecuteAsync<float>(
+    dnnPrimitive_t primitive, void *resources[]) {
+    return dnnExecuteAsync_F32(primitive, resources);
+}
+template <> inline dnnError_t dnnExecuteAsync<double>(
+    dnnPrimitive_t primitive, void *resources[]) {
+    return dnnExecuteAsync_F64(primitive, resources);
+}
+
+template <typename Dtype> inline dnnError_t dnnWaitFor(
+    dnnPrimitive_t primitive);
+template <> inline dnnError_t dnnWaitFor<float>(
+    dnnPrimitive_t primitive) {
+    return dnnWaitFor_F32(primitive);
+}
+template <> inline dnnError_t dnnWaitFor<double>(
+    dnnPrimitive_t primitive) {
+    return dnnWaitFor_F64(primitive);
+}
+
+template <typename Dtype> inline dnnError_t dnnDelete(
+    dnnPrimitive_t primitive);
+template <> inline dnnError_t dnnDelete<float>(
+    dnnPrimitive_t primitive) {
+    return dnnDelete_F32(primitive);
+}
+template <> inline dnnError_t dnnDelete<double>(
+    dnnPrimitive_t primitive) {
+    return dnnDelete_F64(primitive);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnConversionCreate(
+    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to);
+template <> inline dnnError_t dnnConversionCreate<float>(
+    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
+    return dnnConversionCreate_F32(pConversion, from, to);
+}
+template <> inline dnnError_t dnnConversionCreate<double>(
+    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
+    return dnnConversionCreate_F64(pConversion, from, to);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnConversionExecute(
+    dnnPrimitive_t conversion, void *from, void *to);
+template <> inline dnnError_t dnnConversionExecute<float>(
+    dnnPrimitive_t conversion, void *from, void *to) {
+    return dnnConversionExecute_F32(conversion, from, to);
+}
+template <> inline dnnError_t dnnConversionExecute<double>(
+    dnnPrimitive_t conversion, void *from, void *to) {
+    return dnnConversionExecute_F64(conversion, from, to);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnConvolutionCreateForward(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnConvolutionCreateForward<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateForward_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <> inline dnnError_t dnnConvolutionCreateForward<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateForward_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnConvolutionCreateForwardBias(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnConvolutionCreateForwardBias<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateForwardBias_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnConvolutionCreateForwardBias<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateForwardBias_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardData(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnConvolutionCreateBackwardData<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateBackwardData_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnConvolutionCreateBackwardData<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateBackwardData_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardFilter(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateBackwardFilter_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnConvolutionCreateBackwardFilter_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardBias(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t dstSize[]);
+template <> inline dnnError_t dnnConvolutionCreateBackwardBias<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t dstSize[]) {
+    return dnnConvolutionCreateBackwardBias_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, dstSize);
+}
+template <> inline dnnError_t dnnConvolutionCreateBackwardBias<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t dimension, const size_t dstSize[]) {
+    return dnnConvolutionCreateBackwardBias_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               dimension, dstSize);
+}
+
+template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForward(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnGroupsConvolutionCreateForward<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateForward_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnGroupsConvolutionCreateForward<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateForward_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForwardBias(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateForwardBias_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateForwardBias_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardData(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateBackwardData_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateBackwardData_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateBackwardFilter_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t srcSize[],
+    const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnGroupsConvolutionCreateBackwardFilter_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, srcSize, dstSize, filterSize,
+               convolutionStrides, inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t dstSize[]);
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<float>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t dstSize[]) {
+    return dnnGroupsConvolutionCreateBackwardBias_F32(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, dstSize);
+}
+template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<double>(
+    dnnPrimitive_t* pConvolution,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm,
+    size_t groups, size_t dimension, const size_t dstSize[]) {
+    return dnnGroupsConvolutionCreateBackwardBias_F64(
+               pConvolution,
+               attributes,
+               algorithm,
+               groups, dimension, dstSize);
+}
+
+template <typename Dtype> inline dnnError_t dnnReLUCreateForward(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float negativeSlope);
+template <> inline dnnError_t dnnReLUCreateForward<float>(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float negativeSlope) {
+    return dnnReLUCreateForward_F32(
+               pRelu,
+               attributes,
+               dataLayout, negativeSlope);
+}
+template <> inline dnnError_t dnnReLUCreateForward<double>(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float negativeSlope) {
+    return dnnReLUCreateForward_F64(
+               pRelu,
+               attributes,
+               dataLayout, negativeSlope);
+}
+
+template <typename Dtype> inline dnnError_t dnnReLUCreateBackward(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope);
+template <> inline dnnError_t dnnReLUCreateBackward<float>(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
+    return dnnReLUCreateBackward_F32(
+               pRelu,
+               attributes,
+               diffLayout, dataLayout, negativeSlope);
+}
+template <> inline dnnError_t dnnReLUCreateBackward<double>(
+    dnnPrimitive_t* pRelu,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
+    return dnnReLUCreateBackward_F64(
+               pRelu,
+               attributes,
+               diffLayout, dataLayout, negativeSlope);
+}
+
+template <typename Dtype> inline dnnError_t dnnLRNCreateForward(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k);
+template <> inline dnnError_t dnnLRNCreateForward<float>(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
+    return dnnLRNCreateForward_F32(
+               pLrn,
+               attributes,
+               dataLayout, kernel_size, alpha, beta, k);
+}
+template <> inline dnnError_t dnnLRNCreateForward<double>(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
+    return dnnLRNCreateForward_F64(
+               pLrn,
+               attributes,
+               dataLayout, kernel_size, alpha, beta, k);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnLRNCreateBackward(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
+    size_t kernel_size, float alpha, float beta, float k);
+template <> inline dnnError_t dnnLRNCreateBackward<float>(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
+    size_t kernel_size, float alpha, float beta, float k) {
+    return dnnLRNCreateBackward_F32(
+               pLrn,
+               attributes,
+               diffLayout, dataLayout, kernel_size, alpha, beta, k);
+}
+template <> inline dnnError_t dnnLRNCreateBackward<double>(
+    dnnPrimitive_t* pLrn,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
+    size_t kernel_size, float alpha, float beta, float k) {
+    return dnnLRNCreateBackward_F64(
+               pLrn,
+               attributes,
+               diffLayout, dataLayout, kernel_size, alpha, beta, k);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnPoolingCreateForward(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnPoolingCreateForward<float>(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnPoolingCreateForward_F32(
+               pPooling,
+               attributes,
+               op,
+               srcLayout,
+               kernelSize, kernelStride,
+               inputOffset, border_type);
+}
+template <> inline dnnError_t dnnPoolingCreateForward<double>(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnPoolingCreateForward_F64(
+               pPooling,
+               attributes,
+               op,
+               srcLayout,
+               kernelSize, kernelStride,
+               inputOffset, border_type);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnPoolingCreateBackward(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type);
+template <> inline dnnError_t dnnPoolingCreateBackward<float>(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnPoolingCreateBackward_F32(
+               pPooling,
+               attributes,
+               op,
+               srcLayout,
+               kernelSize, kernelStride,
+               inputOffset, border_type);
+}
+template <> inline dnnError_t dnnPoolingCreateBackward<double>(
+    dnnPrimitive_t* pPooling,
+    dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op,
+    const dnnLayout_t srcLayout,
+    const size_t kernelSize[], const size_t kernelStride[],
+    const int inputOffset[], const dnnBorder_t border_type) {
+    return dnnPoolingCreateBackward_F64(
+               pPooling,
+               attributes,
+               op,
+               srcLayout,
+               kernelSize, kernelStride,
+               inputOffset, border_type);
+}
+
+template <typename Dtype> inline dnnError_t dnnConcatCreate(
+    dnnPrimitive_t *pConcat,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src[]);
+template <> inline dnnError_t dnnConcatCreate<float>(
+    dnnPrimitive_t *pConcat,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src[]) {
+    return dnnConcatCreate_F32(
+               pConcat,
+               attributes,
+               N,
+               src);
+}
+template <> inline dnnError_t dnnConcatCreate<double>(
+    dnnPrimitive_t *pConcat,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src[]) {
+    return dnnConcatCreate_F64(
+               pConcat,
+               attributes,
+               N,
+               src);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnSplitCreate(
+    dnnPrimitive_t *pSplit,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src,
+    size_t dst[]);
+template <> inline dnnError_t dnnSplitCreate<float>(
+    dnnPrimitive_t *pSplit,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src,
+    size_t dst[]) {
+    return dnnSplitCreate_F32(
+               pSplit,
+               attributes,
+               N,
+               src,
+               dst);
+}
+template <> inline dnnError_t dnnSplitCreate<double>(
+    dnnPrimitive_t *pSplit,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t N,
+    dnnLayout_t src,
+    size_t dst[]) {
+    return dnnSplitCreate_F64(
+               pSplit,
+               attributes,
+               N,
+               src,
+               dst);
+}
+
+template <typename Dtype> inline dnnError_t dnnSumCreate(
+    dnnPrimitive_t *pSum,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t nSummands, dnnLayout_t layout, Dtype *coefficients);
+template <> inline dnnError_t dnnSumCreate<float>(
+    dnnPrimitive_t *pSum,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t nSummands, dnnLayout_t layout, float *coefficients) {
+    return dnnSumCreate_F32(
+               pSum,
+               attributes,
+               nSummands,
+               layout, coefficients);
+}
+template <> inline dnnError_t dnnSumCreate<double>(
+    dnnPrimitive_t *pSum,
+    dnnPrimitiveAttributes_t attributes,
+    const size_t nSummands, dnnLayout_t layout, double *coefficients) {
+    return dnnSumCreate_F64(
+               pSum,
+               attributes,
+               nSummands,
+               layout, coefficients);
+}
+
+template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateForward_v2(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags);
+
+template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<float>(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags) {
+    return dnnBatchNormalizationCreateForward_v2_F32(
+               pBatchNormalization,
+               attributes,
+               dataLayout, eps, flags);
+}
+template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<double>(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags) {
+    return dnnBatchNormalizationCreateForward_v2_F64(
+               pBatchNormalization,
+               attributes,
+               dataLayout, eps, flags);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateBackward_v2(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags);
+
+template <> inline  dnnError_t dnnBatchNormalizationCreateBackward_v2<float>(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags) {
+    return dnnBatchNormalizationCreateBackward_v2_F32(
+               pBatchNormalization,
+               attributes,
+               dataLayout, eps, flags);
+}
+
+template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2<double>(
+    dnnPrimitive_t* pBatchNormalization,
+    dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, float eps,
+    int flags) {
+    return dnnBatchNormalizationCreateBackward_v2_F64(
+               pBatchNormalization,
+               attributes,
+               dataLayout, eps, flags);
+}
+
+template <typename Dtype> inline dnnError_t dnnInnerProductCreateForward(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels);
+template <> inline dnnError_t dnnInnerProductCreateForward<float>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateForward_F32(pInnerProduct,
+                                            attributes, dimensions,
+                                            srcSize, outputChannels);
+}
+template <> inline dnnError_t dnnInnerProductCreateForward<double>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateForward_F64(pInnerProduct,
+                                            attributes, dimensions,
+                                            srcSize, outputChannels);
+}
+
+template <typename Dtype> inline dnnError_t dnnInnerProductCreateForwardBias(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels);
+
+template <> inline dnnError_t dnnInnerProductCreateForwardBias<float>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateForwardBias_F32(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+template <> inline dnnError_t dnnInnerProductCreateForwardBias<double>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateForwardBias_F64(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+
+
+template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardData(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels);
+
+template <> inline dnnError_t dnnInnerProductCreateBackwardData<float>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateBackwardData_F32(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+template <> inline dnnError_t dnnInnerProductCreateBackwardData<double>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateBackwardData_F64(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+
+
+
+
+template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardFilter(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels);
+
+template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<float>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<double>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t srcSize[],
+    size_t outputChannels) {
+    return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct,
+            attributes, dimensions,
+            srcSize, outputChannels);
+}
+
+
+
+template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardBias(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t dstSize[]);
+
+template <> inline dnnError_t dnnInnerProductCreateBackwardBias<float>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t dstSize[]) {
+    return dnnInnerProductCreateBackwardBias_F32(pInnerProduct,
+            attributes, dimensions,
+            dstSize);
+}
+template <> inline dnnError_t dnnInnerProductCreateBackwardBias<double>(
+    dnnPrimitive_t *pInnerProduct,
+    dnnPrimitiveAttributes_t attributes,
+    size_t dimensions,
+    const size_t dstSize[]) {
+    return dnnInnerProductCreateBackwardBias_F64(pInnerProduct,
+            attributes, dimensions,
+            dstSize);
+}
+#endif  // MXNET_USE_MKL2017 == 1
+#endif  // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
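
For orientation, everything in this header is a thin typed dispatch layer: each Dtype-templated wrapper simply forwards to the _F32 or _F64 entry point of the MKL 2017 DNN C API. A minimal usage sketch of the conversion wrappers follows; it assumes this header and the MKL DNN headers are on the include path, and that from_layout, to_layout, src and dst (hypothetical names) were prepared elsewhere with the layout wrappers declared earlier in the file.

    // Sketch only: reorder a float buffer from one MKL layout to another.
    dnnPrimitive_t reorder = NULL;
    dnnError_t e = dnnConversionCreate<float>(&reorder, from_layout, to_layout);
    CHECK_EQ(e, E_SUCCESS);
    e = dnnConversionExecute<float>(reorder, src, dst);  // src/dst: raw float buffers
    CHECK_EQ(e, E_SUCCESS);
    dnnDelete<float>(reorder);  // primitives must be released explicitly
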
diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h
new file mode 100644
index 0000000000..48c9312911
--- /dev/null
+++ b/src/operator/mkl/mkl_elementwise_copy-inl.h
@@ -0,0 +1,69 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_elementwise_copy-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+#include "./mkl_util-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+void MKLIdentityCompute(const nnvm::NodeAttrs& attrs,
+  const OpContext& ctx,
+  const std::vector<TBlob>& inputs,
+  const std::vector<OpReqType>& req,
+  const std::vector<TBlob>& outputs) {
+  if (!req[0]) return;
+#if MKL_EXPERIMENTAL == 1
+  if (op::mkl_prv_data<DType>(inputs[0])) {
+    std::shared_ptr<MKLMemHolder> in_data_mem = inputs[0].Mkl_mem_;
+    // Use a copy to avoid potential aliasing problems
+    std::shared_ptr<MKLData<DType> > top_data = MKLData<DType>::create();
+    std::shared_ptr<MKLMemHolder> top_mem = outputs[0].Mkl_mem_;
+    top_data->copy_from(in_data_mem);
+    top_mem->set_prv_descriptor(top_data);
+    return;
+  }
+#endif
+  int in_blob_size = inputs[0].Size();
+  int out_blob_size = outputs[0].Size();
+  CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU: sizes do not match";
+  memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType));
+}
+
+
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
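
MKLIdentityCompute has the standard FCompute kernel signature, so, as a hedged illustration only (this is not the registration actually performed by this PR), it could be attached to a copy-style operator roughly as below; the float specialization is hard-coded purely for brevity.

    // Hypothetical registration sketch; the real operator registration for
    // this kernel lives elsewhere in the MXNet source tree.
    NNVM_REGISTER_OP(_copy)
    .set_attr<FCompute>("FCompute<cpu>", MKLIdentityCompute<cpu, float>);
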
diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h
new file mode 100644
index 0000000000..d313fd15a5
--- /dev/null
+++ b/src/operator/mkl/mkl_elementwise_sum-inl.h
@@ -0,0 +1,117 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_elementwise_sum-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+#include "./mkl_util-inl.h"
+
+
+namespace mxnet {
+namespace op {
+template<typename xpu, typename DType>
+static void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 1, DType> > &data,
+  size_t data_shape_size,
+  std::shared_ptr<MKLData<DType> > fwd_top_data) {
+  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
+  // of computing the gradient for the PROD operation. (No effect for SUM op.)
+  // stable_prod_grad_ = 1;
+  size_t dim_src = data_shape_size;
+  size_t *sizes_src = new size_t[dim_src];
+  size_t *strides_src = new size_t[dim_src];
+  for (size_t d = 0; d < dim_src; ++d) {
+    sizes_src[d] = data[0].shape_[dim_src - d - 1];
+    strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
+  }
+
+  fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src);
+  delete[] sizes_src;
+  delete[] strides_src;
+}
+
+template<typename xpu, typename DType>
+void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs,
+  const OpContext& ctx,
+  const std::vector<TBlob>& in_data,
+  const std::vector<OpReqType>& req,
+  const std::vector<TBlob>& out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  if (req[0] == kNullOp) return;
+  size_t size = in_data.size();
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  std::vector<Tensor<xpu, 1, DType> > data(size);
+  Tensor<xpu, 1, DType> out = out_data[0].FlatTo1D<xpu, DType>(s);
+  bool in_place_flag = false;
+  int in_place_idx = 0;
+
+  for (size_t i = 0; i < size; ++i) {
+    data[i]  = in_data[i].FlatTo1D<xpu, DType>(s);
+    if (data[i].dptr_ == out.dptr_) {
+      in_place_idx = i;
+      in_place_flag = true;
+    }
+  }
+  std::shared_ptr<MKLData<DType> > fwd_top_data = MKLData<DType>::create();
+  std::vector<DType> coeffs_  = std::vector<DType>(data.size(), 1);
+  LayerSetUp(data, 1, fwd_top_data);
+
+
+  dnnError_t e;
+  void *eltwise_res[dnnResourceNumber];
+  dnnPrimitive_t sumPrimitive = NULL;
+  e = dnnSumCreate<DType>(&sumPrimitive, NULL, size, fwd_top_data->layout_usr,
+    &coeffs_[0]);
+  CHECK_EQ(e, E_SUCCESS);
+
+  eltwise_res[dnnResourceDst] = reinterpret_cast<void*>(const_cast<DType*>(out.dptr_));
+  eltwise_res[dnnResourceMultipleSrc] =
+    reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[in_place_idx].dptr_));
+  for (size_t i = 1; i < size; ++i) {
+    if (i == in_place_idx) continue;
+    eltwise_res[dnnResourceMultipleSrc + i] =
+      reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[i].dptr_));
+  }
+
+  e = dnnExecute<DType>(sumPrimitive, eltwise_res);
+  CHECK_EQ(e, E_SUCCESS);
+
+  if (sumPrimitive != NULL) {
+    dnnDelete<DType>(sumPrimitive);
+    sumPrimitive = NULL;
+  }
+}
+
+
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
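
Stripped of the TBlob plumbing, the forward path above reduces to the usual MKL primitive lifecycle: create a sum primitive over a layout, bind resources, execute, delete. A standalone sketch with hypothetical float buffers a, b, out and a previously created user layout:

    // Sketch: out = 1*a + 1*b for two equally shaped float buffers.
    float coeffs[2] = {1.0f, 1.0f};
    dnnPrimitive_t sum = NULL;
    CHECK_EQ(dnnSumCreate<float>(&sum, NULL, 2, layout, coeffs), E_SUCCESS);
    void* res[dnnResourceNumber] = {};
    res[dnnResourceMultipleSrc]     = a;    // first summand
    res[dnnResourceMultipleSrc + 1] = b;    // second summand
    res[dnnResourceDst]             = out;  // destination
    CHECK_EQ(dnnExecute<float>(sum, res), E_SUCCESS);
    dnnDelete<float>(sum);
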
diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h
new file mode 100644
index 0000000000..5e296704b6
--- /dev/null
+++ b/src/operator/mkl/mkl_fully_connected-inl.h
@@ -0,0 +1,192 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_fully_connected-inl.h
+* \brief
+* \author zhenlin.luo@intel.com
+*          lingyan.guo@intel.com
+*         
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "../activation-inl.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLFullyConnectedOp : public Operator {
+ public:
+  explicit MKLFullyConnectedOp(const FullyConnectedParam& p,
+                               const std::vector<TShape>& in_shapes,
+                               const std::vector<TShape>& out_shapes):
+    param_(p) {
+    LayerSetUp(in_shapes, out_shapes);
+  }
+
+  ~MKLFullyConnectedOp() {
+    dnnDelete<DType>(fullyConnectedFwd);
+    dnnDelete<DType>(fullyConnectedBwdData);
+    dnnDelete<DType>(fullyConnectedBwdFilter);
+    dnnDelete<DType>(fullyConnectedBwdBias);
+  }
+  static std::string getName() {
+    return "MKLFullyConnectedOp";
+  }
+
+ private:
+  void LayerSetUp(const std::vector<TShape>& in_shapes,
+                  const std::vector<TShape>& out_shapes) {
+    const TShape& ishape = in_shapes[fullc::kData];
+
+    const size_t dim = 4;
+    const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]};
+    const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]};
+    const size_t output_channels = param_.num_hidden;
+
+    dnnPrimitiveAttributes_t attributes = NULL;
+    MKLDNN_CALL(dnnPrimitiveAttributesCreate<DType>(&attributes));
+    if (!param_.no_bias) {
+      MKLDNN_CALL(dnnInnerProductCreateForwardBias<DType>(
+            &fullyConnectedFwd,
+            attributes,
+            dim,
+            src_sizes,
+            output_channels));
+    } else {
+      MKLDNN_CALL(dnnInnerProductCreateForward<DType>(
+            &fullyConnectedFwd,
+            attributes,
+            dim,
+            src_sizes,
+            output_channels));
+    }
+    MKLDNN_CALL(dnnInnerProductCreateBackwardData<DType>(
+          &fullyConnectedBwdData,
+          attributes,
+          dim,
+          src_sizes,
+          output_channels));
+    MKLDNN_CALL(dnnInnerProductCreateBackwardFilter<DType>(
+          &fullyConnectedBwdFilter,
+          attributes,
+          dim,
+          src_sizes,
+          output_channels));
+    if (!param_.no_bias) {
+      MKLDNN_CALL(dnnInnerProductCreateBackwardBias<DType>(
+            &fullyConnectedBwdBias,
+            attributes,
+            2,
+            dst_sizes));
+    }
+    // TODO(minjie): Shouldn't `attributes` be destroyed?
+  }
+
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    void* res_fullyConnected[dnnResourceNumber];
+    if (req[fullc::kOut] == kNullOp) return;
+    CHECK_EQ(req[fullc::kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), param_.no_bias ? 2 : 3);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+
+    const TShape& ishape = in_data[fullc::kData].shape_;
+    const TShape& oshape = out_data[fullc::kOut].shape_;
+
+    Tensor<xpu, 4, DType> data;
+    Tensor<xpu, 4, DType> out;
+
+    Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1);
+
+    Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1);
+    Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1);
+
+    data = in_data[fullc::kData].get_with_shape<xpu, 4, DType>(dshape, s);
+    out = out_data[fullc::kOut].get_with_shape<xpu, 4, DType>(odshape, s);
+    res_fullyConnected[dnnResourceSrc] =
+      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
+    res_fullyConnected[dnnResourceDst] =
+      reinterpret_cast<void *>(out_data[fullc::kOut].dptr_);
+    res_fullyConnected[dnnResourceFilter] =
+      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
+    if (!param_.no_bias) {
+      res_fullyConnected[dnnResourceBias] = reinterpret_cast<void *>(in_data[fullc::kBias].dptr_);
+    }
+
+    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedFwd, res_fullyConnected));
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    void* res_fullyConnected[dnnResourceNumber];
+    CHECK_EQ(out_grad.size(), 1);
+    const size_t expected = param_.no_bias ? 2 : 3;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    res_fullyConnected[dnnResourceSrc] =
+      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
+    res_fullyConnected[dnnResourceFilter] =
+      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
+
+    res_fullyConnected[dnnResourceDiffDst] =
+      reinterpret_cast<void *>(out_grad[fullc::kOut].dptr_);
+    res_fullyConnected[dnnResourceDiffSrc] =
+      reinterpret_cast<void *>(in_grad[fullc::kData].dptr_);
+    res_fullyConnected[dnnResourceDiffFilter] =
+      reinterpret_cast<void *>(in_grad[fullc::kWeight].dptr_);
+    if (!param_.no_bias) {
+      res_fullyConnected[dnnResourceDiffBias] =
+        reinterpret_cast<void *>(in_grad[fullc::kBias].dptr_);
+    }
+    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdFilter, res_fullyConnected));
+    if (!param_.no_bias) {
+      MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdBias, res_fullyConnected));
+    }
+    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdData, res_fullyConnected));
+  }
+
+ private:
+  dnnPrimitive_t fullyConnectedFwd{nullptr};
+  dnnPrimitive_t fullyConnectedBwdData{nullptr};
+  dnnPrimitive_t fullyConnectedBwdFilter{nullptr};
+  dnnPrimitive_t fullyConnectedBwdBias{nullptr};
+  const FullyConnectedParam param_;
+};  // class MKLFullyConnectedOp
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
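
The operator above drives four inner-product primitives (forward, backward data, backward filter, backward bias) that all share the same 4-D source description. A hedged sketch of just the forward-with-bias creation, with illustrative sizes (batch 16, 1024 input features, 512 hidden units):

    // Sketch: forward fully-connected primitive; sizes are {w, h, features, batch}.
    const size_t src_sizes[4] = {1, 1, 1024, 16};
    dnnPrimitive_t fc_fwd = NULL;
    CHECK_EQ(dnnInnerProductCreateForwardBias<float>(
                 &fc_fwd, NULL, 4, src_sizes, /*outputChannels=*/512), E_SUCCESS);
    // ...bind dnnResourceSrc/Filter/Bias/Dst and call dnnExecute<float>(fc_fwd, res)...
    dnnDelete<float>(fc_fwd);
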
diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h
new file mode 100644
index 0000000000..90dfad50fa
--- /dev/null
+++ b/src/operator/mkl/mkl_lrn-inl.h
@@ -0,0 +1,265 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_lrn-inl.h
+* \brief
+* \author zhenlin.luo@intel.com
+*         lingyan.guo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLLRNOp : public Operator {
+ public:
+  static std::string getName() {
+    return "MKLLRNOp";
+  }
+
+  explicit MKLLRNOp(LRNParam param) :
+    lrnFwd(static_cast<dnnPrimitive_t>(NULL)),
+    lrnBwd(static_cast<dnnPrimitive_t>(NULL)),
+    lrn_buffer_(NULL) {
+    this->param_ = param;
+    fwd_top_data_ = MKLData<DType>::create();
+    fwd_bottom_data_ = MKLData<DType>::create();
+    bwd_top_diff_ = MKLData<DType>::create();
+    bwd_bottom_diff_ = MKLData<DType>::create();
+    init_mkldnn_ = false;
+  }
+
+  virtual ~MKLLRNOp() {
+    if (lrnFwd != NULL) {
+      dnnDelete<DType>(lrnFwd);
+      lrnFwd = NULL;
+    }
+    if (lrnBwd != NULL) {
+      dnnDelete<DType>(lrnBwd);
+      lrnBwd = NULL;
+    }
+    dnnReleaseBuffer<DType>(lrn_buffer_);
+  }
+
+ private:
+  void LayerSetup(const mshadow::Tensor<xpu, 4, DType> &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out) {
+    size_ = param_.nsize;
+    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size";
+
+    alpha_ = param_.alpha;
+    beta_ = param_.beta;
+    k_ = param_.knorm;
+    size_t dim = 4, sizes[4], strides[4];
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+    sizes[0] = width_;
+    sizes[1] = height_;
+    sizes[2] = channels_;
+    sizes[3] = num_;
+
+    strides[0] = 1;
+    strides[1] = sizes[0];
+    strides[2] = sizes[0] * sizes[1];
+    strides[3] = sizes[0] * sizes[1] * sizes[2];
+
+    fwd_bottom_data_->name = "fwd_bottom_data_   @ " + getName();
+    fwd_top_data_->name = "fwd_top_data_      @ " + getName();
+    bwd_top_diff_->name = "bwd_top_diff_      @ " + getName();
+    bwd_bottom_diff_->name = "bwd_bottom_diff_   @ " + getName();
+
+    fwd_bottom_data_->create_user_layout(dim, sizes, strides);
+    fwd_top_data_->create_user_layout(dim, sizes, strides);
+    bwd_bottom_diff_->create_user_layout(dim, sizes, strides);
+    bwd_top_diff_->create_user_layout(dim, sizes, strides);
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 2U);
+    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
+      in_data[lrn_enum::kData], s);
+    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
+      out_data[lrn_enum::kOut], s);
+    if (!init_mkldnn_) {
+      LayerSetup(data, out);
+      init_mkldnn_ = true;
+    }
+
+    const void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data =
+          reinterpret_cast<void*>(mkl_prv_data<DType>(in_data[lrn_enum::kData]));
+#endif
+#if MKL_EXPERIMENTAL == 1
+    if (NULL != bottom_data) {
+      if (lrnFwd == NULL) {
+        std::shared_ptr<MKLMemHolder> bottom_data_mem =
+          in_data[lrn_enum::kData].Mkl_mem_;
+        std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
+          bottom_data_mem->get_prv_descriptor();
+        CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
+            PrvMemDescr::PRV_DESCR_MKL2017);
+        std::shared_ptr<MKLData<DType> > mem_descr
+          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
+        CHECK(mem_descr != nullptr);
+        fwd_bottom_data_ = mem_descr;
+
+        dnnError_t e;
+        dnnLayout_t lrn_buffer_l = NULL;
+
+        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_int,
+                                       size_, alpha_, beta_, k_);
+        CHECK_EQ(e, E_SUCCESS);
+
+        fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst);
+
+        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
+                                        fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int,
+                                        size_, alpha_, beta_, k_);
+        CHECK_EQ(e, E_SUCCESS);
+
+        e = dnnLayoutCreateFromPrimitive<DType>(
+              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
+        CHECK_EQ(e, E_SUCCESS);
+        e = dnnAllocateBuffer<DType>(
+              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
+        CHECK_EQ(e, E_SUCCESS);
+        dnnLayoutDelete<DType>(lrn_buffer_l);
+
+        bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst);
+        bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc);
+      }
+    }
+#endif
+    if (bottom_data == NULL) {
+      if (lrnFwd == NULL) {
+        dnnError_t e;
+        dnnLayout_t lrn_buffer_l = NULL;
+        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_usr,
+                                       size_, alpha_, beta_, k_);
+        CHECK_EQ(e, E_SUCCESS);
+
+        e = dnnLayoutCreateFromPrimitive<DType>(
+              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
+        CHECK_EQ(e, E_SUCCESS);
+        e = dnnAllocateBuffer<DType>(
+              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
+        CHECK_EQ(e, E_SUCCESS);
+        dnnLayoutDelete<DType>(lrn_buffer_l);
+
+        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
+                                        fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
+                                        size_, alpha_, beta_, k_);
+        CHECK_EQ(e, E_SUCCESS);
+      }
+      bottom_data = data.dptr_;
+    }
+
+    dnnError_t e;
+    void* lrn_res[dnnResourceNumber];
+    lrn_res[dnnResourceSrc] = const_cast<void*>(bottom_data);
+
+    lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
+      out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]);
+    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
+    e = dnnExecute<DType>(lrnFwd, lrn_res);
+    CHECK_EQ(e, E_SUCCESS);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 2);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
+      out_grad[lrn_enum::kOut], s);
+    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
+      in_data[lrn_enum::kData], s);
+    Tensor<xpu, 4, DType> grad_in = mkl_experimental_direct_get<xpu, 4, DType>(
+      in_grad[lrn_enum::kData], s);
+    dnnError_t e;
+    void* lrn_res[dnnResourceNumber];
+    lrn_res[dnnResourceDiffDst] =
+      bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]);
+    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
+    lrn_res[dnnResourceSrc] =
+      fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]);
+
+    lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
+      grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]);
+    e = dnnExecute<DType>(lrnBwd, lrn_res);
+    CHECK_EQ(e, E_SUCCESS);
+  }
+
+ private:
+  LRNParam param_;
+  int size_;
+  int pre_pad_;
+  DType alpha_;
+  DType beta_;
+  DType k_;
+  int num_;
+  int channels_;
+  int height_;
+  int width_;
+  bool init_mkldnn_;
+
+ private:
+  dnnPrimitive_t lrnFwd, lrnBwd;
+  std::shared_ptr<MKLData<DType> > fwd_top_data_;
+  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
+
+  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
+  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
+
+  DType *lrn_buffer_;
+};  // class LocalResponseNormOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
+
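When no MKL-private input layout is available, the LRN path above falls back to the user layout, which makes the primitive creation easy to show in isolation. A sketch with a hypothetical user_layout handle and illustrative hyperparameters standing in for nsize, alpha, beta and knorm:

    // Sketch: LRN forward primitive over a plain user layout.
    dnnPrimitive_t lrn_fwd = NULL;
    CHECK_EQ(dnnLRNCreateForward<float>(&lrn_fwd, NULL, user_layout,
                                        /*kernel_size=*/5, /*alpha=*/1e-4f,
                                        /*beta=*/0.75f, /*k=*/2.0f),
             E_SUCCESS);
    // Bind dnnResourceSrc, dnnResourceDst and dnnResourceWorkspace,
    // then call dnnExecute<float>(lrn_fwd, lrn_res) as in Forward() above.
    dnnDelete<float>(lrn_fwd);
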
diff --git a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h
new file mode 100644
index 0000000000..71af10254b
--- /dev/null
+++ b/src/operator/mkl/mkl_memory-inl.h
@@ -0,0 +1,137 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_memory-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
+
+
+#include <string>
+#include <vector>
+#include <memory>
+#include "mkl_cppwrapper.h"
+
+namespace mxnet {
+
+template <typename DType>
+struct MKLMemoryDescriptorBase : public PrvMemDescr,
+ public std::enable_shared_from_this<MKLMemoryDescriptorBase<DType> > {
+    MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL),
+    convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL),
+    name("UNKNOWN"), internal_ptr(NULL) {}
+  virtual ~MKLMemoryDescriptorBase() {
+    dnnLayoutDelete<DType>(layout_usr);
+    dnnLayoutDelete<DType>(layout_int);
+    if (internal_ptr != NULL) {
+      dnnReleaseBuffer<DType>(internal_ptr);
+      internal_ptr = NULL;
+    }
+    if (convert_to_int != NULL) {
+      dnnDelete<DType>(convert_to_int);
+      convert_to_int = NULL;
+    }
+    if (convert_from_int != NULL) {
+      dnnDelete<DType>(convert_from_int);
+      convert_from_int = NULL;
+    }
+    if (convert_prv2prv != NULL) {
+      dnnDelete<DType>(convert_prv2prv);
+      convert_prv2prv = NULL;
+    }
+  }
+  std::shared_ptr<MKLMemoryDescriptorBase<DType> > get_shared_ptr() {
+    return this->shared_from_this();
+  }
+
+  dnnLayout_t layout_usr;
+  dnnLayout_t layout_int;
+  dnnPrimitive_t convert_to_int;
+  dnnPrimitive_t convert_from_int;
+  dnnPrimitive_t convert_prv2prv;
+  std::shared_ptr<MKLMemoryDescriptorBase<DType> > descr_prv2prv_conversion;
+
+
+  std::string name;  // for debugging purposes
+  void allocate() {
+    if (internal_ptr == NULL) {
+      int status = dnnAllocateBuffer<DType>(
+              reinterpret_cast<void **>(&internal_ptr), layout_int);
+      CHECK_EQ(status, E_SUCCESS)
+          << "Failed internal_ptr memory allocation with status "
+          << status << "\n";
+    }
+  }
+  virtual void* prv_ptr(bool allocate_when_uninit = true) {
+    if (internal_ptr == NULL && allocate_when_uninit)
+      allocate();
+    return internal_ptr;
+  }
+  inline bool conversion_needed() {
+    return (convert_to_int != NULL);
+  }
+  void create_conversions();
+  void create_internal_layout(const dnnPrimitive_t primitive,
+                dnnResourceType_t type);
+  void create_user_layout(size_t dimension, const size_t size[],
+              const size_t strides[]);
+  void create_layouts(
+    const dnnPrimitive_t primitive, dnnResourceType_t type,
+    size_t dimension, const size_t size[], const size_t strides[]);
+
+  virtual PrvDescrType get_descr_type() {
+    return PRV_DESCR_MKL2017;
+  }
+  virtual size_t prv_size() {
+    return dnnLayoutGetMemorySize<DType>(layout_int);
+  }
+  virtual size_t prv_count() {
+    return dnnLayoutGetMemorySize<DType>(layout_int) / sizeof(DType);
+  }
+  virtual void convert_from_prv(void* cpu_ptr);
+  virtual void convert_to_prv(void* cpu_ptr);
+  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other);
+  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other);
+ protected:
+  DType* internal_ptr;
+};
+
+template <typename DType>
+struct MKLMemoryDescriptor : MKLMemoryDescriptorBase<DType> {
+  // The last get_converted_prv() argument is a hack for reusing
+  // in backward a conversion done already in the forward direction.
+  DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr,
+      const TBlob &blob);
+  void* get_output_ptr(DType *data_ptr, std::shared_ptr<MKLMemoryDescriptor<DType> > self_ptr,
+    const TBlob &blob, bool in_place = false);
+  bool copy_from(std::shared_ptr<MKLMemHolder> dnn_chunk);
+  MKLMemoryDescriptor() {}
+};
+
+template <typename DType> struct MKLData : MKLMemoryDescriptor<DType> {
+  static std::shared_ptr<MKLData<DType> > create() {
+    return std::make_shared<MKLData<DType> >();
+  }
+};
+
+template struct MKLData<float>;
+template struct MKLData<double>;
+
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
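
For orientation, the descriptor restored above pairs a plain "user" layout with the MKL-internal layout a primitive prefers, and creates conversion primitives between the two only when the layouts differ. A minimal sketch of the intended call sequence, using only names declared in mkl_memory-inl.h; the primitive, buffer shape, and function name are illustrative:

    #include <memory>
    #include "mkl_memory-inl.h"   // MKLData and the dnn* wrappers from the patch above

    void wire_up_descriptor(dnnPrimitive_t fwd_primitive, float* cpu_buf) {
      // A 1x3x8x8 NCHW buffer described in MKL's (width, height, channels, batch) order.
      size_t sizes[4]   = {8, 8, 3, 1};
      size_t strides[4] = {1, 8, 8 * 8, 8 * 8 * 3};

      std::shared_ptr<mxnet::MKLData<float> > bottom = mxnet::MKLData<float>::create();
      bottom->create_user_layout(4, sizes, strides);                  // plain CPU layout
      bottom->create_internal_layout(fwd_primitive, dnnResourceSrc);  // layout the primitive wants

      if (bottom->conversion_needed()) {
        bottom->convert_to_prv(cpu_buf);    // repack user data into the private buffer
        // ... execute the primitive against bottom->prv_ptr() ...
        bottom->convert_from_prv(cpu_buf);  // copy the result back into the user layout
      }
    }

When the two layouts already match, conversion_needed() stays false and the primitive can read the user buffer directly.
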
diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc
new file mode 100644
index 0000000000..7682fe1c1f
--- /dev/null
+++ b/src/operator/mkl/mkl_memory.cc
@@ -0,0 +1,291 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_memory.cc
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#include "../operator_common.h"
+
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "mkl_memory-inl.h"
+#include "mkl_util-inl.h"
+
+namespace mxnet {
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::create_conversions() {
+  int status;
+  if (this->convert_from_int) {
+    status = dnnDelete<Dtype>(this->convert_from_int);
+    CHECK_EQ(status, E_SUCCESS);
+    this->convert_from_int = NULL;
+  }
+  if (this->convert_to_int) {
+    status = dnnDelete<Dtype>(this->convert_to_int);
+    CHECK_EQ(status, E_SUCCESS);
+    this->convert_to_int = NULL;
+  }
+  if (layout_int
+      && !dnnLayoutCompare<Dtype>(layout_usr, layout_int)) {
+    CHECK(layout_usr);
+    status = dnnConversionCreate<Dtype>(&convert_to_int, layout_usr,
+            layout_int);
+    CHECK_EQ(status, E_SUCCESS)
+            << "Failed creation convert_to_int with status "
+            << status << " for buffer: " << this->name << "\n";
+    status = dnnConversionCreate<Dtype>(&convert_from_int, layout_int,
+            layout_usr);
+    CHECK_EQ(status, E_SUCCESS)
+            << "Failed creation convert_from_int with status "
+            << status << " for buffer: " << this->name << "\n";
+  }
+}
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::create_internal_layout(
+    const dnnPrimitive_t primitive, dnnResourceType_t type) {
+  int status;
+  if (this->layout_int) {
+    status = dnnLayoutDelete<Dtype>(this->layout_int);
+    CHECK_EQ(status, E_SUCCESS);
+  }
+  status = dnnLayoutCreateFromPrimitive<Dtype>(
+      &this->layout_int, primitive, type);
+  CHECK_EQ(status, E_SUCCESS)
+      << "Failed dnnLayoutCreateFromPrimitive with status "
+      << status << " for buffer: " << this->name << "\n";
+
+  if (this->layout_usr)
+    this->create_conversions();
+}
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::create_user_layout(
+    size_t dimension, const size_t size[], const size_t strides[]) {
+  int status;
+  if (this->layout_usr) {
+    status = dnnLayoutDelete<Dtype>(this->layout_usr);
+    CHECK_EQ(status, E_SUCCESS);
+  }
+
+  status = dnnLayoutCreate<Dtype>(
+      &this->layout_usr, dimension, size, strides);
+  CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status "
+      << status << " for buffer: " << this->name << "\n";
+
+  if (this->layout_int)
+    this->create_conversions();
+}
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::create_layouts(
+    const dnnPrimitive_t primitive, dnnResourceType_t type,
+    size_t dimension, const size_t size[], const size_t strides[]) {
+  this->create_internal_layout(primitive, type);
+  this->create_user_layout(dimension, size, strides);
+}
+
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::convert_from_prv(void* cpu_ptr) {
+  CHECK(cpu_ptr);
+  CHECK(this->convert_from_int);
+  int status;
+  void *convert_resources[dnnResourceNumber];
+
+  convert_resources[dnnResourceFrom] = this->prv_ptr();
+  convert_resources[dnnResourceTo]   = cpu_ptr;
+  status = dnnExecute<Dtype>(this->convert_from_int, convert_resources);
+  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
+}
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::convert_to_prv(void* cpu_ptr) {
+  CHECK(cpu_ptr);
+  CHECK(this->convert_to_int);
+  int status;
+  void *convert_resources[dnnResourceNumber];
+
+  convert_resources[dnnResourceFrom] = cpu_ptr;
+  convert_resources[dnnResourceTo]   = this->prv_ptr();
+  status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
+  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
+}
+
+
+template <typename Dtype>
+bool MKLMemoryDescriptorBase<Dtype>::layout_compare(
+  std::shared_ptr<PrvMemDescr> other) {
+  CHECK_EQ(other->get_descr_type(),
+              PrvMemDescr::PRV_DESCR_MKL2017);
+  std::shared_ptr<MKLMemoryDescriptorBase<Dtype> > other_descr =
+    std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
+    (other);
+
+  if (dnnLayoutCompare<Dtype>(other_descr->layout_int,
+      this->layout_int))
+    return true;
+  else
+    return false;
+}
+
+template <typename Dtype>
+void MKLMemoryDescriptorBase<Dtype>::convert_from_other(
+  std::shared_ptr<PrvMemDescr> other) {
+    std::shared_ptr<MKLMemoryDescriptorBase<Dtype> > other_descr =
+        std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
+            (other);
+
+  int status;
+  dnnPrimitive_t convert;
+  status = dnnConversionCreate<Dtype>(&convert,
+    other_descr->layout_int, this->layout_int);
+  CHECK_EQ(status, E_SUCCESS) << "Failed to create conversion primitive, status " << status;
+  void *convert_resources[dnnResourceNumber];
+  convert_resources[dnnResourceFrom] = other_descr->prv_ptr();
+  convert_resources[dnnResourceTo]   = this->prv_ptr();
+  status = dnnExecute<Dtype>(convert, convert_resources);
+  CHECK_EQ(status, 0) << "Conversion from other failed with status "
+                      << status;
+
+  dnnDelete<Dtype>(convert);
+}
+
+
+template <typename Dtype>
+Dtype* MKLMemoryDescriptor<Dtype>::get_converted_prv(
+    Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) {
+  Dtype* prv_ptr = NULL;
+  std::shared_ptr<MKLMemHolder> dnn_chunk = NULL;
+#if MKL_EXPERIMENTAL == 1
+  dnn_chunk = blob.Mkl_mem_;
+#endif
+#if MKL_EXPERIMENTAL == 1
+  if (dnn_chunk != NULL)
+    prv_ptr = static_cast<Dtype*>(dnn_chunk->prv_data());
+#endif
+
+  if (this->convert_to_int != NULL) {
+#if MKL_EXPERIMENTAL == 1
+    int status;
+    void *convert_resources[dnnResourceNumber];
+#endif
+    if (prv_ptr == NULL) {
+      this->allocate();
+      this->convert_to_prv(cpu_ptr);
+#if MKL_EXPERIMENTAL == 1
+      if (set_prv_ptr) {
+        dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
+      }
+#endif
+      return this->internal_ptr;
+    }
+#if MKL_EXPERIMENTAL == 1
+    if (prv_ptr != NULL)  {
+      std::shared_ptr<MKLData<Dtype> > current_descr =
+        op::mkl_get_mem_desc<Dtype>(dnn_chunk);
+      if (!dnnLayoutCompare<Dtype>(current_descr->layout_int,
+        this->layout_int)) {
+        if (this->convert_prv2prv) {
+          CHECK_EQ(dnnLayoutCompare<Dtype>(
+            this->descr_prv2prv_conversion->layout_int,
+            this->layout_int), 0);
+          status = 0;
+        } else {
+          status = dnnConversionCreate<Dtype>(&this->convert_prv2prv,
+            current_descr->layout_int, this->layout_int);
+          if (status == 0)
+            this->descr_prv2prv_conversion = current_descr;
+        }
+        if (status != 0) {
+          this->allocate();
+          convert_resources[dnnResourceFrom] = cpu_ptr;
+          convert_resources[dnnResourceTo] =
+            reinterpret_cast<void*>(this->internal_ptr);
+          status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
+          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
+        } else {
+          this->allocate();
+          convert_resources[dnnResourceFrom] = reinterpret_cast<void*>(prv_ptr);
+          convert_resources[dnnResourceTo] =
+            reinterpret_cast<void*>(this->internal_ptr);
+          status = dnnExecute<Dtype>(this->convert_prv2prv, convert_resources);
+          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
+        }
+        if (set_prv_ptr) {
+          dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
+        }
+        return this->internal_ptr;
+      } else if (current_descr.get() != this) {
+        // MKL_DLOG(INFO) << "layout OK                 "
+        //  << current_descr->name << " == " << this->name;
+      }
+    }
+#endif
+    return const_cast<Dtype *>(prv_ptr);
+  } else {
+    if (prv_ptr != NULL) {
+#if MKL_EXPERIMENTAL == 1
+      std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
+        std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
+        (dnn_chunk->prv_descriptor_);
+      dnn_chunk->check_and_prv_to_cpu(cpu_ptr);
+#endif
+      // printf("get_converted_prv release %s\n", other_descr->name.c_str());
+    }
+  }
+  return cpu_ptr;
+}
+
+template <typename Dtype>
+void* MKLMemoryDescriptor<Dtype>::get_output_ptr(Dtype *data_ptr,
+  std::shared_ptr<MKLMemoryDescriptor<Dtype> > self_ptr, const TBlob &blob, bool in_place) {
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> dnn_chunk = blob.Mkl_mem_;
+#endif
+  if (this->conversion_needed()) {
+    void * prv_ptr =  this->prv_ptr();
+#if MKL_EXPERIMENTAL == 1
+    if (!in_place) {
+      dnn_chunk->set_prv_descriptor(self_ptr);
+    } else {
+      Dtype * blob_prv = op::mkl_prv_data<Dtype>(blob);
+      if (blob_prv != NULL)
+        return blob_prv;
+    }
+#endif
+    return prv_ptr;
+  } else {
+#if MKL_EXPERIMENTAL == 1
+    std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
+      std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
+      (dnn_chunk->prv_descriptor_);
+    dnn_chunk->check_and_prv_to_cpu(data_ptr);
+#endif
+    return data_ptr;
+  }
+}
+
+template class MKLMemoryDescriptor<double>;
+template class MKLMemoryDescriptor<float>;
+
+template class MKLMemoryDescriptorBase<float>;
+template class MKLMemoryDescriptorBase<double>;
+}  // namespace mxnet
+#endif
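
The prv-to-prv branch of get_converted_prv() above covers the case where a producer left its output in one MKL-internal layout and a consumer expects another. Roughly the same decision, expressed through the public descriptor API; both descriptors are assumed to already hold internal layouts and data, and the function name is illustrative:

    #include <memory>
    #include "mkl_memory-inl.h"

    void adopt_or_reorder(const std::shared_ptr<mxnet::MKLData<float> >& producer,
                          const std::shared_ptr<mxnet::MKLData<float> >& consumer) {
      if (consumer->layout_compare(producer)) {
        // Identical internal layouts: the producer's private buffer can be used as-is.
        (void)producer->prv_ptr();
      } else {
        // Different layouts: build and run a prv-to-prv conversion, which is what
        // get_converted_prv() does lazily via convert_prv2prv above.
        consumer->convert_from_other(producer);
      }
    }
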
diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h
new file mode 100644
index 0000000000..13f1fd27b1
--- /dev/null
+++ b/src/operator/mkl/mkl_memory.h
@@ -0,0 +1,123 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_memory.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_
+#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_
+
+#include <string>
+#include <vector>
+#include <memory>
+
+
+namespace mxnet {
+// Base class
+struct PrvMemDescr {
+  virtual void convert_from_prv(void* cpu_ptr) = 0;
+  virtual void convert_to_prv(void* cpu_ptr) = 0;
+  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other) = 0;
+  virtual void* prv_ptr(bool allocate_when_uninit = true) = 0;
+  // returns true for matching layouts
+  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other) = 0;
+  virtual size_t prv_count() = 0;
+  virtual size_t prv_size() = 0;
+  // This may help when prv_ptr_ is used by different accelerators/engines
+  enum PrvDescrType {
+    PRV_DESCR_MKL2017,
+    PRV_DESCR_MKLDNN
+  };
+  virtual PrvDescrType get_descr_type() = 0;
+};
+
+#if MKL_EXPERIMENTAL == 1
+// Currently HEAD_AT_PRV does not free the CPU data
+enum SyncedHead {
+  HEAD_AT_CPU,
+  HEAD_AT_PRV,
+};
+struct MKLMemHolder {
+  SyncedHead head_;
+  std::shared_ptr<PrvMemDescr> prv_descriptor_;
+  bool  b_disable_prv_2_cpu;
+  bool  b_eager_mode;
+  void disable_prv_2_cpu(bool flag) {
+    b_disable_prv_2_cpu = flag;
+  }
+  void set_eager_mode(bool eager_mode) {
+    b_eager_mode = eager_mode;
+  }
+  void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
+    head_ = HEAD_AT_PRV;
+    prv_descriptor_ = descriptor;
+  }
+  std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
+    return  prv_descriptor_;
+  }
+  bool head_at_prv() {
+    return (head_ == HEAD_AT_PRV) ? true : false;
+  }
+  void* prv_data(bool allocate_when_uninit = true) {
+    if (head_ != HEAD_AT_PRV) {
+      return NULL;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << " prv_descriptor_  is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return reinterpret_cast<void*>(prv_descriptor_->prv_ptr(allocate_when_uninit));
+  }
+
+  int prv_count() {
+    if (head_ != HEAD_AT_PRV) {
+      return 0;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << " prv_descriptor_  is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return prv_descriptor_->prv_count();
+  }
+  static std::shared_ptr<MKLMemHolder> create() {
+    return std::make_shared<MKLMemHolder>();
+  }
+  void  check_and_prv_to_cpu(void *dptr_) {
+    if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
+      CHECK(prv_descriptor_ != nullptr);
+      prv_descriptor_->convert_from_prv(dptr_);
+      // Because the operator uses the CPU copy and may modify it, move the head back to CPU
+      head_ = HEAD_AT_CPU;
+    }
+    if (b_disable_prv_2_cpu) {
+      b_disable_prv_2_cpu = false;
+    }
+  }
+  MKLMemHolder() :
+    head_(HEAD_AT_CPU), prv_descriptor_(nullptr),
+    b_disable_prv_2_cpu(false), b_eager_mode(false) {}
+};
+#else
+struct MKLMemHolder {
+ public:
+  virtual std::shared_ptr<PrvMemDescr> get_prv_descriptor() = 0;
+};
+#endif
+
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_H_
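
In builds with MKL_EXPERIMENTAL == 1, each TBlob carries an MKLMemHolder that records whether the freshest copy of the data sits in the private (MKL-layout) buffer or in the ordinary CPU buffer. A small sketch of how a consumer interacts with it, assuming the holder and cpu_ptr come from an existing blob:

    #include "mkl_memory.h"

    void consume(mxnet::MKLMemHolder* holder, float* cpu_ptr) {
      if (holder->head_at_prv()) {
        void* prv = holder->prv_data();         // non-NULL only while head_ == HEAD_AT_PRV
        (void)prv;                              // an MKL-aware operator would read this directly
        holder->check_and_prv_to_cpu(cpu_ptr);  // otherwise convert back and flip head_ to CPU
      }
      // From here on, cpu_ptr holds up-to-date data in the user layout.
    }
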
diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h
new file mode 100644
index 0000000000..5662a61aeb
--- /dev/null
+++ b/src/operator/mkl/mkl_pooling-inl.h
@@ -0,0 +1,357 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_pooling-inl.h
+* \brief
+* \author zhenlin.luo@intel.com
+*         lingyan.guo@intel.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../nn/pooling-inl.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+
+template<typename xpu, typename DType>
+class MKLPoolingOp : public Operator {
+ public:
+  static std::string getName() {
+    return "MKLPoolingOp";
+  }
+  explicit MKLPoolingOp(PoolingParam p) {
+    poolingFwd = static_cast<dnnPrimitive_t>(NULL);
+    poolingBwd = static_cast<dnnPrimitive_t>(NULL);
+    max_idx_data = static_cast<DType*>(NULL);
+    fwd_top_data = MKLData<DType>::create();
+    fwd_bottom_data = MKLData<DType>::create();
+    bwd_top_diff = MKLData<DType>::create();
+    bwd_bottom_diff = MKLData<DType>::create();
+    this->param_ = p;
+    init_mkldnn_ = false;
+  }
+  virtual ~MKLPoolingOp() {
+    if (poolingFwd != NULL) {
+      dnnDelete<DType>(poolingFwd);
+      poolingFwd = NULL;
+    }
+    if (poolingBwd != NULL) {
+      dnnDelete<DType>(poolingBwd);
+      poolingBwd = NULL;
+    }
+    if (max_idx_data != NULL) {
+      dnnReleaseBuffer<DType>(max_idx_data);
+      max_idx_data = NULL;
+    }
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out) {
+    channels_ = data.shape_[1];
+    height_ = data.shape_[2];
+    width_ = data.shape_[3];
+    num_ = data.shape_[0];
+    global_pooling_ = param_.global_pool;
+    if (global_pooling_) {
+      kernel_h_ = height_;
+      kernel_w_ = width_;
+    } else {
+      kernel_h_ = param_.kernel[0];
+      kernel_w_ = param_.kernel[1];
+    }
+    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+    pad_h_ = param_.pad[0];
+    pad_w_ = param_.pad[1];
+    if (global_pooling_) {
+      stride_h_ = stride_w_ = 1;
+    } else {
+      stride_h_ = param_.stride[0];
+      stride_w_ = param_.stride[1];
+    }
+    if (global_pooling_) {
+      CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+    }
+    if (pad_h_ != 0 || pad_w_ != 0) {
+      CHECK(param_.pool_type == pool_enum::kAvgPooling
+          || param_.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+      CHECK_LT(pad_h_, kernel_h_);
+      CHECK_LT(pad_w_, kernel_w_);
+    }
+    pooled_height_ = out.shape_[2];
+    pooled_width_ = out.shape_[3];
+
+    size_t dim = 4;
+    size_t src_sizes[4], src_strides[4];
+    size_t dst_sizes[4], dst_strides[4];
+    src_sizes[0] = width_;
+    src_sizes[1] = height_;
+    src_sizes[2] = channels_;
+    src_sizes[3] = num_;
+    src_strides[0] = 1;
+    src_strides[1] = src_sizes[0];
+    src_strides[2] = src_sizes[0] * src_sizes[1];
+    src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2];
+    dst_sizes[0] = pooled_width_;
+    dst_sizes[1] = pooled_height_;
+    dst_sizes[2] = src_sizes[2];
+    dst_sizes[3] = src_sizes[3];
+    dst_strides[0] = 1;
+    dst_strides[1] = dst_sizes[0];
+    dst_strides[2] = dst_sizes[0] * dst_sizes[1];
+    dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2];
+    src_offset[0] = -pad_w_;
+    src_offset[1] = -pad_h_;
+    src_offset[2] = -pad_w_;
+    src_offset[3] = -pad_h_;
+    kernel_stride[0] = stride_w_;
+    kernel_stride[1] = stride_h_;
+    kernel_size[0] = kernel_w_;
+    kernel_size[1] = kernel_h_;
+
+    // Names are for debugging only
+    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
+    fwd_top_data->name = "fwd_top_data      @ " + getName();
+    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
+    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
+
+    fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides);
+    fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides);
+    bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides);
+    bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides);
+
+    // Primitives will be allocated during the first fwd pass
+    poolingFwd = NULL;
+    poolingBwd = NULL;
+    max_idx_data = NULL;
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.kernel.ndim() >= 3) {
+      LOG(FATAL) << "Not implmented";
+    }
+    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
+      in_data[pool_enum::kData], s);
+    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
+      out_data[pool_enum::kOut], s);
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    auto first_pass = false;
+    if (poolingFwd == NULL) first_pass = true;
+
+    dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
+
+    switch (param_.pool_type) {
+    case pool_enum::kMaxPooling:
+      algorithm = dnnAlgorithmPoolingMax;
+      break;
+    case pool_enum::kAvgPooling:
+      algorithm = dnnAlgorithmPoolingAvgIncludePadding;
+
+      break;
+    default:
+      LOG(FATAL) << "Unknown pooling method.";
+    }
+
+    dnnError_t status;
+    void* pooling_res[dnnResourceNumber];
+
+    void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data =
+          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[pool_enum::kData]));
+#endif
+    dnnBorder_t border_type = dnnBorderZerosAsymm;
+    switch (param_.pooling_convention) {
+    case pool_enum::kFull:
+      border_type = dnnBorderZeros;
+      break;
+    case pool_enum::kValid:
+      border_type = dnnBorderZerosAsymm;
+      break;
+    default:
+      border_type = dnnBorderZerosAsymm;
+      break;
+    }
+    if (NULL == bottom_data) {
+      bottom_data = data.dptr_;
+      if (NULL == poolingFwd) {
+        status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
+                                                algorithm, fwd_bottom_data->layout_usr,
+                                                kernel_size, kernel_stride,
+                                                src_offset, border_type);
+        CHECK_EQ(status, E_SUCCESS);
+        // Now create poolingBwd
+        status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
+                                                 algorithm, fwd_bottom_data->layout_usr,
+                                                 kernel_size, kernel_stride,
+                                                 src_offset, border_type);
+        CHECK_EQ(status, E_SUCCESS);
+      }
+    }
+#if MKL_EXPERIMENTAL == 1
+    if (NULL != bottom_data) {
+       if (NULL == poolingFwd) {
+          std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_;
+          std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
+            bottom_data_mem->get_prv_descriptor();
+          CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
+                   PrvMemDescr::PRV_DESCR_MKL2017);
+          std::shared_ptr<MKLData<DType> > mem_descr
+            = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
+          CHECK(mem_descr != nullptr);
+          fwd_bottom_data = mem_descr;
+
+          status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
+                                                  algorithm, fwd_bottom_data->layout_int,
+                                                  kernel_size, kernel_stride,
+                                                  src_offset, border_type);
+          CHECK_EQ(status, E_SUCCESS);
+          fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
+
+          // Now create poolingBwd
+          status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
+                                                   algorithm, fwd_bottom_data->layout_int,
+                                                   kernel_size, kernel_stride,
+                                                   src_offset, border_type);
+          CHECK_EQ(status, E_SUCCESS);
+          bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst);
+          bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc);
+        }
+    }
+#endif
+
+    if (first_pass) {
+      dnnLayout_t max_idx_datal = NULL;
+      status = dnnLayoutCreateFromPrimitive<DType>(
+          &max_idx_datal, poolingFwd, dnnResourceWorkspace);
+      CHECK_EQ(status, E_SUCCESS);
+      status = dnnAllocateBuffer<DType>(reinterpret_cast<void**>(&max_idx_data), max_idx_datal);
+      CHECK_EQ(status, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+      fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc);
+      fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
+      bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst);
+      bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc);
+#endif
+      dnnLayoutDelete<DType>(max_idx_datal);
+      first_pass = false;
+    }
+    pooling_res[dnnResourceSrc] = bottom_data;
+    pooling_res[dnnResourceWorkspace] = max_idx_data;
+
+    pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr(
+      out.dptr_, fwd_top_data, out_data[pool_enum::kOut]);
+    status = dnnExecute<DType>(poolingFwd, pooling_res);
+    CHECK_EQ(status, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+    if (fwd_top_data->conversion_needed()) {
+      fwd_top_data->convert_from_prv(out.dptr_);
+    }
+#endif
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    if (!req[0]) {
+      return;
+    }
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    if (param_.kernel.ndim() >= 3) {
+      LOG(FATAL) << "Not implmented";
+    }
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
+      out_grad[pool_enum::kOut], s);
+    Tensor<xpu, 4, DType> input_grad = mkl_experimental_direct_get<xpu, 4, DType>(
+      in_grad[pool_enum::kData], s);
+    dnnError_t e;
+    void* pooling_res[dnnResourceNumber];
+    pooling_res[dnnResourceWorkspace] = reinterpret_cast<void *>(max_idx_data);
+
+    pooling_res[dnnResourceDiffDst] =
+      bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]);
+
+    pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(
+      input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]);
+    e = dnnExecute<DType>(poolingBwd, pooling_res);
+    CHECK_EQ(e, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+    if (bwd_bottom_diff->conversion_needed()) {
+      bwd_bottom_diff->convert_from_prv(input_grad.dptr_);
+    }
+#endif
+  }
+
+ private:
+  PoolingParam param_;
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
+  int pad_h_, pad_w_;
+  int channels_, num_;
+  int height_, width_;
+  int pooled_height_, pooled_width_;
+  bool global_pooling_;
+
+ private:
+  size_t kernel_size[2],
+         kernel_stride[4];
+  int src_offset[4];  // 2*(dimension-2)
+  dnnPrimitive_t poolingFwd, poolingBwd;
+  DType *max_idx_data;
+
+  std::shared_ptr<MKLData<DType> > fwd_top_data;
+  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
+  std::shared_ptr<MKLData<DType> > bwd_top_diff;
+  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
+  bool init_mkldnn_;
+};  // class MKLPoolingOp
+}   // namespace op
+}   // namespace mxnet
+
+#endif  // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
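
The size/stride bookkeeping in MKLPoolingOp::LayerSetUp above follows MKL 2017's convention of describing an NCHW tensor in (width, height, channels, batch) order with the innermost stride equal to one. The same arithmetic as a standalone helper, names illustrative:

    #include <cstddef>

    struct MklDims { std::size_t sizes[4]; std::size_t strides[4]; };

    // Mirrors how src_sizes/src_strides are filled in LayerSetUp.
    MklDims describe_nchw(std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
      MklDims d;
      d.sizes[0] = w; d.sizes[1] = h; d.sizes[2] = c; d.sizes[3] = n;
      d.strides[0] = 1;
      for (int i = 1; i < 4; ++i)
        d.strides[i] = d.strides[i - 1] * d.sizes[i - 1];
      return d;
    }
    // e.g. describe_nchw(32, 64, 28, 28) gives strides {1, 28, 784, 50176},
    // the same values LayerSetUp would compute for a 32x64x28x28 input.
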
diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h
new file mode 100644
index 0000000000..8d7ab5e1e2
--- /dev/null
+++ b/src/operator/mkl/mkl_relu-inl.h
@@ -0,0 +1,272 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_relu-inl.h
+* \brief
+* \author zhenlin.luo@intel.com
+*         lingyan.guo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
+
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "./mkl_util-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu, typename DType>
+class MKLReluOp : public Operator {
+ public:
+  static std::string getName() {
+    return "MKLReluOp";
+  }
+  MKLReluOp():
+      reluFwd_(NULL),
+      reluBwd_(NULL) {
+    init_mkldnn_ = false;
+    fwd_top_data_ = MKLData<DType>::create();
+    fwd_bottom_data_ = MKLData<DType>::create();
+    bwd_top_diff_ = MKLData<DType>::create();
+    bwd_bottom_diff_ = MKLData<DType>::create();
+  }
+
+  ~MKLReluOp() {
+    if (reluFwd_ != NULL) {
+      dnnDelete<DType>(reluFwd_);
+      reluFwd_ = NULL;
+    }
+    if (reluBwd_ != NULL) {
+      dnnDelete<DType>(reluBwd_);
+      reluBwd_ = NULL;
+    }
+  }
+
+ private:
+  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
+                  const mshadow::Tensor<xpu, 4, DType> &out) {
+    size_t dim = 4;
+    size_t *sizes = new size_t[dim];
+    size_t *strides = new size_t[dim];
+    for (size_t d = 0; d < dim; ++d) {
+      (sizes)[d] = data.shape_[dim - 1 - d];
+      (strides)[d] = (d == 0) ? 1 : (strides)[d - 1] * (sizes)[d - 1];
+    }
+    // Names are for debugging only
+    fwd_bottom_data_->name = "fwd_bottom_data   @ " + getName();
+    fwd_top_data_->name = "fwd_top_data      @ " + getName();
+    bwd_bottom_diff_->name = "bwd_bottom_diff   @ " + getName();
+    bwd_top_diff_->name = "bwd_top_diff      @ " + getName();
+    fwd_bottom_data_->create_user_layout(dim, (sizes), (strides));
+    fwd_top_data_->create_user_layout(dim, (sizes), (strides));
+    bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides));
+    bwd_top_diff_->create_user_layout(dim, (sizes), (strides));
+    delete[] sizes;
+    delete[] strides;
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data;
+    Tensor<xpu, 4, DType> out;
+    if (in_data[activation::kData].ndim() == 1) {
+      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_data[activation::kData], dshape, s);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+    } else if (in_data[activation::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
+      in_data[activation::kData].shape_[1], 1, 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_data[activation::kData], dshape, s);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+    } else if (in_data[activation::kData].ndim() == 3) {
+      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
+        in_data[activation::kData].shape_[1],
+        in_data[activation::kData].shape_[2], 1);
+      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_data[activation::kData], dshape, s);
+      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+    } else {
+      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[activation::kData], s);
+      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
+    }
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data =
+          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[activation::kData]));
+#endif
+#if MKL_EXPERIMENTAL == 1
+    if (bottom_data != NULL) {
+      if (reluFwd_ == NULL) {
+        std::shared_ptr<MKLData<DType> > mem_descr =
+          mkl_get_mem_desc<DType>(in_data[activation::kData].Mkl_mem_);
+        DType negative_slope = 0;
+        dnnError_t e;
+        e = dnnReLUCreateForward<DType>(&reluFwd_, NULL, mem_descr->layout_int,
+                                        negative_slope);
+        CHECK_EQ(e, E_SUCCESS);
+        e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL, mem_descr->layout_int,
+                                         mem_descr->layout_int, negative_slope);
+        CHECK_EQ(e, E_SUCCESS);
+
+        fwd_bottom_data_ = mem_descr;
+        fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst);
+        bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst);
+        bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc);
+      }
+    }
+#endif
+    if (bottom_data  == NULL) {
+      bottom_data = data.dptr_;
+      if (reluFwd_ == NULL) {
+        dnnError_t e;
+        DType negative_slope = 0;
+        e = dnnReLUCreateForward<DType>(&reluFwd_, NULL,
+                                        fwd_bottom_data_->layout_usr, negative_slope);
+        CHECK_EQ(e, E_SUCCESS);
+        e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL,
+                                         fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
+                                         negative_slope);
+        CHECK_EQ(e, E_SUCCESS);
+      }
+    }
+    dnnError_t e;
+    void* relu_res[dnnResourceNumber];
+    relu_res[dnnResourceSrc] = bottom_data;
+
+    relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
+      out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_));
+    e = dnnExecute<DType>(reluFwd_, relu_res);
+    CHECK_EQ(e, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+    if (fwd_top_data_->conversion_needed()) {
+      fwd_top_data_->convert_from_prv(out.dptr_);
+    }
+#endif
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    if (!req[0]) {
+      return;
+    }
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK(in_data.size() == 1 && in_grad.size() == 1);
+    CHECK_EQ(req.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> m_out_grad;
+    Tensor<xpu, 4, DType> m_out_data;
+    Tensor<xpu, 4, DType> m_in_grad;
+
+    if (out_grad[activation::kOut].ndim() == 1) {
+      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1);
+      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[activation::kOut], dshape, s);
+      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_grad[activation::kData], dshape, s);
+    } else if (out_grad[activation::kOut].ndim() == 2) {
+      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
+                               out_grad[activation::kOut].shape_[1], 1, 1);
+      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[activation::kOut], dshape, s);
+      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_grad[activation::kData], dshape, s);
+    } else if (out_grad[activation::kOut].ndim() == 3) {
+      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
+        out_grad[activation::kOut].shape_[1],
+        out_grad[activation::kOut].shape_[2], 1);
+      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_grad[activation::kOut], dshape, s);
+      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        out_data[activation::kOut], dshape, s);
+      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
+        in_grad[activation::kData], dshape, s);
+    } else {
+      m_out_grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[activation::kOut], s);
+      m_out_data = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
+      m_in_grad = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[activation::kData], s);
+    }
+    dnnError_t e;
+    void* relu_res[dnnResourceNumber];
+
+    void* bottom_data = NULL;
+#if MKL_EXPERIMENTAL == 1
+    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(out_data[activation::kOut]));
+#endif
+    if (NULL == bottom_data) {
+      bottom_data = reinterpret_cast<void *>(const_cast<DType*>(m_out_data.dptr_));
+    }
+    relu_res[dnnResourceSrc] = bottom_data;
+    relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_,
+                true, out_grad[activation::kOut]);
+    relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
+      m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]);
+    e = dnnExecute<DType>(reluBwd_, relu_res);
+    CHECK_EQ(e, E_SUCCESS);
+#if MKL_EXPERIMENTAL == 0
+    if (bwd_bottom_diff_->conversion_needed()) {
+      bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_);
+    }
+#endif
+  }
+
+ private:
+  bool init_mkldnn_;
+  std::shared_ptr<MKLData<DType> > fwd_top_data_;
+  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
+  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
+  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
+  dnnPrimitive_t reluFwd_, reluBwd_;
+};  // class MKLReluOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
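
MKLReluOp above always hands the MKL primitive a 4-D tensor, so rank-1, -2 and -3 inputs are viewed with trailing singleton dimensions (the Shape4 branches in Forward and Backward). The reshape rule as a standalone sketch:

    #include <array>
    #include <cstddef>
    #include <vector>

    // Pads a shape of rank 1..4 with trailing 1s, mirroring the Shape4(...) branches above.
    std::array<std::size_t, 4> pad_to_4d(const std::vector<std::size_t>& shape) {
      std::array<std::size_t, 4> out = {1, 1, 1, 1};
      for (std::size_t i = 0; i < shape.size() && i < 4; ++i)
        out[i] = shape[i];
      return out;
    }
    // pad_to_4d({128, 256}) -> {128, 256, 1, 1}, the 2-D case handled in Forward().
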
diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h
new file mode 100644
index 0000000000..4ad786a2ce
--- /dev/null
+++ b/src/operator/mkl/mkl_util-inl.h
@@ -0,0 +1,110 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkl_util-inl.h
+* \brief
+* \author lingyan.guo@intel.com
+*         zhenlin.luo@intel.com
+*
+*******************************************************************************/
+#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
+#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
+#include <vector>
+#define MKLDNN_CALL(func)                                                               \
+  {                                                                                     \
+    dnnError_t status = (func);                                                                \
+    CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ").";           \
+  }
+
+
+namespace mxnet {
+namespace op {
+
+#if MKL_EXPERIMENTAL == 1
+  template<typename DType>
+  inline DType * mkl_prv_data(const TBlob &b) {
+    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
+    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
+    if (mem_valid) {
+      return reinterpret_cast<DType*>(bottom_data_mem->prv_data());
+    }
+    return NULL;
+  }
+
+  template<typename DType>
+  inline int mkl_prv_count(const TBlob &b) {
+    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
+    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
+    if (mem_valid) {
+      return bottom_data_mem->prv_count();
+    }
+    return 0;
+  }
+#endif
+  inline void mkl_set_priv_flag(const TBlob &b) {
+#if MKL_EXPERIMENTAL == 1
+    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
+    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
+    if (mem_valid) {
+      bottom_data_mem->disable_prv_2_cpu(true);
+    }
+#endif
+  }
+#if MKL_EXPERIMENTAL == 1
+  template<typename DType>
+  inline std::shared_ptr<MKLData<DType> > mkl_get_mem_desc(
+    const std::shared_ptr<MKLMemHolder> data_mem) {
+    std::shared_ptr<PrvMemDescr> prv_descriptor =
+      data_mem->get_prv_descriptor();
+    CHECK_EQ(prv_descriptor->get_descr_type(),
+      PrvMemDescr::PRV_DESCR_MKL2017);
+    std::shared_ptr<MKLData<DType> > mem_descr
+      = std::static_pointer_cast<MKLData<DType>>
+      (prv_descriptor);
+    CHECK(mem_descr != NULL);
+    return mem_descr;
+  }
+#endif
+  template<typename xpu, int dim, typename DType>
+  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get(
+    const TBlob &b, mshadow::Stream<xpu> *s) {
+    mkl_set_priv_flag(b);
+    return b.get<xpu, dim, DType>(s);
+  }
+  template<typename xpu, int dim, typename DType>
+  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get_with_shape(
+    const TBlob &b, const mshadow::Shape<dim> &shape, mshadow::Stream<xpu> *s) {
+    mkl_set_priv_flag(b);
+    return b.get_with_shape<xpu, dim, DType>(shape, s);
+  }
+}  // namespace op
+#if MKL_EXPERIMENTAL == 1
+inline void mkl_tblobs_prv_to_cpu(const std::vector<TBlob> &data) {
+  for (size_t i = 0; i < data.size(); i++) {
+    std::shared_ptr<MKLMemHolder> mem_holder = data[i].Mkl_mem_;
+    if (mem_holder != nullptr && mem_holder->b_eager_mode) {
+      mem_holder->check_and_prv_to_cpu(data[i].dptr_);
+    }
+  }
+}
+inline void mkl_set_tblob_eager_mode(const TBlob &data) {
+  std::shared_ptr<MKLMemHolder> mem_holder = data.Mkl_mem_;
+  if (mem_holder != nullptr) {
+    mem_holder->set_eager_mode(true);
+  }
+}
+#endif
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
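
Two helpers in mkl_util-inl.h carry most of the glue: MKLDNN_CALL turns a dnn* status into a fatal CHECK, and mkl_experimental_direct_get fetches a tensor view from a TBlob while keeping a pending private buffer from being flushed to CPU behind the operator's back. A brief usage sketch; the includes are assumed to resolve as they do in the operator sources above:

    #include "../operator_common.h"  // CHECK macros, TBlob, mshadow (as in mkl_memory.cc)
    #include "mkl_util-inl.h"

    template <typename DType>
    void run_primitive(dnnPrimitive_t prim, void* resources[dnnResourceNumber]) {
      MKLDNN_CALL(dnnExecute<DType>(prim, resources));  // aborts with the status code on failure
    }

    template <typename xpu, typename DType>
    void read_input(const mxnet::TBlob& blob, mshadow::Stream<xpu>* s) {
      // In MKL_EXPERIMENTAL builds this first marks the blob so a pending private
      // buffer is not converted back to CPU, then returns the usual mshadow view.
      mshadow::Tensor<xpu, 4, DType> data =
          mxnet::op::mkl_experimental_direct_get<xpu, 4, DType>(blob, s);
      (void)data;
    }
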
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index a440f97e13..ac8b747f0f 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file activation-inl.h
  * \brief Activation operator
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_
@@ -37,7 +37,6 @@
 #include <utility>
 #include "../operator_common.h"
 #include "../mxnet_op.h"
-#include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
@@ -46,7 +45,6 @@ namespace op {
 namespace activation {
 enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
-enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU};
 }  // activation
 
@@ -61,148 +59,160 @@ struct ActivationParam : public dmlc::Parameter<ActivationParam> {
     .add_enum("softrelu", activation::kSoftReLU)
     .describe("Activation function to be applied.");
   }
-
-  bool operator==(const ActivationParam& other) const {
-    return this->act_type == other.act_type;
-  }
 };
 
-}  // namespace op
-}  // namespace mxnet
+/**
+ * \brief This is the implementation of activation operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+class ActivationOp : public Operator {
+ public:
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TBlob& input = in_data[activation::kData];
+    const size_t sz = input.shape_.Size();
+    if (sz) {
+      MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
+          s, sz,
+          out_data[activation::kOut].dptr<DType>(),
+          input.dptr<DType>());
+      });
+    }
+  }
 
-namespace std {
-template<>
-struct hash<mxnet::op::ActivationParam> {
-  size_t operator()(const mxnet::op::ActivationParam& val) {
-    return val.act_type;
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK(in_data.size() == 1 && in_grad.size() == 1);
+    CHECK_EQ(req.size(), 1U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TBlob& m_out_grad = out_grad[activation::kOut];
+    const TBlob& m_out_data = out_data[activation::kOut];
+    const TBlob&  m_in_grad = in_grad[activation::kData];
+    const size_t sz = m_out_data.shape_.Size();
+    if (sz) {
+      MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<
+          mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
+          s, sz,
+          m_in_grad.dptr<DType>(),
+          m_out_grad.dptr<DType>(),
+          m_out_data.dptr<DType>());
+      });
+    }
   }
-};
-}  // namespace std
+};  // class ActivationOp
 
-namespace mxnet {
-namespace op {
+// Declare Factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape);
 
-template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
-void ActivationForward(const OpContext &ctx, const TBlob &in_data,
-                       const OpReqType &req, const TBlob &out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const size_t sz = in_data.shape_.Size();
-  if (sz) {
-    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
-      mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
-        s, sz,
-        out_data.dptr<DType>(),
-        in_data.dptr<DType>());
-    });
+#if DMLC_USE_CXX11
+class ActivationProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
   }
-}
 
-template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
-void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
-                        const TBlob &out_data, const OpReqType &req,
-                        const TBlob &in_grad) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const size_t sz = out_data.shape_.Size();
-  if (sz) {
-    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
-      mxnet_op::Kernel<mxnet_op::op_with_req<
-        mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
-        s, sz,
-        in_grad.dptr<DType>(),
-        out_grad.dptr<DType>(),
-        out_data.dptr<DType>());
-    });
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
   }
-}
 
-template<typename xpu>
-void ActivationComputeImpl(const ActivationParam &param, const OpContext &ctx,
-                           const TBlob &input, OpReqType req, const TBlob &output) {
-  MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        ActivationForward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kSigmoid:
-        ActivationForward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kTanh:
-        ActivationForward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
-            ctx, input, req, output);
-        break;
-      case activation::kSoftReLU:
-        ActivationForward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-            ctx, input, req, output);
-        break;
-      default:
-        LOG(FATAL) << "unknown activation type";
-    }
-  });
-}
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
+    const TShape &dshape = in_shape->at(activation::kData);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
 
-template<typename xpu>
-void ActivationGradComputeImpl(const ActivationParam &param, const OpContext &ctx,
-                               const TBlob &out_grad, const TBlob &out_data,
-                               OpReqType req, const TBlob &output) {
-  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        ActivationBackward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kSigmoid:
-        ActivationBackward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kTanh:
-        ActivationBackward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      case activation::kSoftReLU:
-        ActivationBackward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-            ctx, out_grad, out_data, req, output);
-        break;
-      default:
-        LOG(FATAL) << "unknown activation type";
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+          (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
     }
-  });
-}
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
 
-template<typename xpu>
-void ActivationCompute(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  ActivationComputeImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
-}
+  OperatorProperty* Copy() const override {
+    auto ptr = new ActivationProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
 
-template<typename xpu>
-void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+  std::string TypeString() const override {
+    return "Activation";
+  }
+
+  // declare dependency and inplace optimization options
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
 #if MXNET_USE_CUDNN == 1
-  CHECK_EQ(inputs.size(), 3U);
+    return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]};
 #else
-  CHECK_EQ(inputs.size(), 2U);
-#endif
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  ActivationGradComputeImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], outputs[0]);
-}
+    return {out_grad[activation::kOut], out_data[activation::kOut]};
+#endif  // MXNET_USE_CUDNN
+  }
 
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{out_grad[activation::kOut], in_grad[activation::kData]}};
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[activation::kData], out_data[activation::kOut]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  ActivationParam param_;
+};
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_ACTIVATION_INL_H_
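
A note on the DeclareBackwardDependency override restored above: in the non-cuDNN branch only out_grad and out_data are requested, because the gradients of these activations can be computed from the forward output alone. A minimal standalone sketch of that idea for softrelu (illustrative C++, not MXNet code; names are made up):

#include <cmath>
#include <vector>

// Sketch: element-wise softrelu (softplus) where the backward pass is
// computed purely from the forward output, mirroring the non-cuDNN
// dependency list {out_grad, out_data} declared above.
static double softrelu(double x) { return std::log1p(std::exp(x)); }
// dy/dx = sigmoid(x) = 1 - exp(-y), so the original input x is not needed.
static double softrelu_grad_from_output(double y) { return 1.0 - std::exp(-y); }

int main() {
  std::vector<double> x = {-2.0, 0.0, 3.0};
  std::vector<double> y(x.size()), dx(x.size());
  std::vector<double> dy = {1.0, 1.0, 1.0};   // incoming gradient
  for (size_t i = 0; i < x.size(); ++i) y[i] = softrelu(x[i]);
  for (size_t i = 0; i < y.size(); ++i) dx[i] = dy[i] * softrelu_grad_from_output(y[i]);
  return 0;
}
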
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 0da644cb1f..401a9e3eaa 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -17,130 +17,69 @@
  * under the License.
  */
 
-
 /*!
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 #include "./activation-inl.h"
 #include "../mshadow_op.h"
-#include "../tensor/elemwise_unary_op.h"
-#if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_base-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#endif  // MXNET_USE_MKLDNN
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "../mkl/mkl_memory-inl.h"
+#include "../mkl/mkl_relu-inl.h"
+#endif  // MXNET_USE_MKL2017
 
 namespace mxnet {
 namespace op {
-
-DMLC_REGISTER_PARAMETER(ActivationParam);
-
-// This will determine the order of the inputs for backward computation.
-struct ActivationGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-    heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
-#if MXNET_USE_CUDNN == 1
-    heads.push_back(n->inputs[activation::kData]);
-#endif
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
-
-#if MXNET_USE_MKLDNN == 1
-static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                   const OpContext& ctx,
-                                   const std::vector<NDArray>& inputs,
-                                   const std::vector<OpReqType>& req,
-                                   const std::vector<NDArray>& outputs) {
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  if (SupportMKLDNN(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]);
-    MKLDNN_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
+template<>
+Operator *CreateOp<cpu>(ActivationParam param, int dtype, const TShape& dshape) {
+  Operator *op = NULL;
+#if MXNET_USE_MKL2017 == 1
+  if (param.act_type == activation::kReLU && dshape.ndim() <= 4) {
+      switch (dtype) {
+      case mshadow::kFloat32:
+          return new MKLReluOp<cpu, float>();
+      case mshadow::kFloat64:
+          return new MKLReluOp<cpu, double>();
+      default:
+          break;
+      }
   }
-  ActivationComputeImpl<cpu>(param, ctx, inputs[0].data(), req[0], outputs[0].data());
-}
-
-void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                const OpContext& ctx,
-                                const std::vector<NDArray>& inputs,
-                                const std::vector<OpReqType>& req,
-                                const std::vector<NDArray>& outputs) {
-#if MXNET_USE_CUDNN == 1
-  CHECK_EQ(inputs.size(), 3U);
-#else
-  CHECK_EQ(inputs.size(), 2U);
+  if (enableMKLWarnGenerated())
+    LOG(INFO) << MKLReluOp<cpu, float>::getName() << " Skip MKL optimization";
 #endif
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  if (SupportMKLDNN(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0],
-                             outputs[0]);
-      MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  ActivationGradComputeImpl<cpu>(param, ctx, inputs[0].data(), inputs[1].data(),
-                                 req[0], outputs[0].data());
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        op = new ActivationOp<cpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
+        break;
+      case activation::kSigmoid:
+        op = new ActivationOp<cpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
+        break;
+      case activation::kTanh:
+        op = new ActivationOp<cpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
+        break;
+      case activation::kSoftReLU:
+        op = new ActivationOp<cpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
+    }
+  })
+  return op;
 }
-#endif
 
-inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
-                                         const int dev_mask,
-                                         DispatchMode* dispatch_mode,
-                                         std::vector<int> *in_attrs,
-                                         std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-  CHECK_EQ(out_attrs->size(), 1);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                           std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
 }
 
-inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
-                                          const int dev_mask,
-                                          DispatchMode* dispatch_mode,
-                                          std::vector<int> *in_attrs,
-                                          std::vector<int> *out_attrs) {
-#if MXNET_USE_CUDNN == 1
-  CHECK_EQ(in_attrs->size(), 3U);
-#else
-  CHECK_EQ(in_attrs->size(), 2U);
-#endif
-  CHECK_EQ(out_attrs->size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-#if MXNET_USE_CUDNN == 1
-  bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#else
-  bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
-                                                            dispatch_mode,
-                                                            in_attrs, out_attrs);
-#endif
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
+DMLC_REGISTER_PARAMETER(ActivationParam);
 
-MXNET_OPERATOR_REGISTER_UNARY(Activation)
+MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp)
 .describe(R"code(Applies an activation function element-wise to the input.
 
 The following activation functions are supported:
@@ -151,35 +90,8 @@ The following activation functions are supported:
 - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
 
 )code" ADD_FILELINE)
-.set_attr_parser(ParamParser<ActivationParam>)
-.set_attr<FInferStorageType>("FInferStorageType", ActivationStorageType)
-.set_attr<FCompute>("FCompute<cpu>", ActivationCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ActivationGrad{"_backward_Activation"})
+.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
 .add_arguments(ActivationParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_Activation)
-.set_num_inputs(3)
-.set_num_outputs(1)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
-.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
-.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr_parser(ParamParser<ActivationParam>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", ActivationGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
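
The CreateOp<cpu> factory restored above relies on MSHADOW_REAL_TYPE_SWITCH to turn the runtime dtype flag into a compile-time template argument before constructing ActivationOp. A rough standalone sketch of that dispatch pattern (hypothetical names, not the actual mshadow macro):

#include <stdexcept>

enum DTypeFlag { kF32, kF64 };

// Sketch: map a runtime dtype flag onto a template instantiation,
// the way MSHADOW_REAL_TYPE_SWITCH does in the operator factory above.
template <typename F>
void real_type_switch(DTypeFlag flag, F&& body) {
  switch (flag) {
    case kF32: body(float{}); break;    // body sees the concrete type
    case kF64: body(double{}); break;
    default: throw std::runtime_error("unknown dtype");
  }
}

struct OpBase { virtual ~OpBase() = default; };
template <typename DType> struct ReluOp : OpBase {};

OpBase* CreateReluOp(DTypeFlag dtype) {
  OpBase* op = nullptr;
  real_type_switch(dtype, [&](auto t) { op = new ReluOp<decltype(t)>(); });
  return op;
}
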
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index dc435b2acc..c2f6be9f37 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -31,73 +31,39 @@
 
 namespace mxnet {
 namespace op {
-
-#if MXNET_USE_CUDNN == 1
-
-template<typename DType>
-static CuDNNActivationOp<DType> &get_cudnn_op(const ActivationParam& param) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNActivationOp<DType> cudnn_op;
-#else
-  static MX_THREAD_LOCAL CuDNNActivationOp<DType> cudnn_op;
-#endif
-  cudnn_op.Init(param);
-  return cudnn_op;
-}
-
 template<>
-void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-
+Operator *CreateOp<gpu>(ActivationParam param, int dtype, const TShape& dshape) {
+  Operator *op = NULL;
   // SoftReLU not supported by CUDNN yet
   if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(ctx,
-          inputs[0], req[0], outputs[0]);
-    });
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      get_cudnn_op<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
-    });
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      op = new ActivationOp<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
+    })
+    return op;
   }
-}
-
-template<>
-void ActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx,
-    const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 3U);
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
 
-  // SoftReLU not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
-          ctx, inputs[0], inputs[1], req[0], outputs[0]);
-    });
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]);
-    });
-  }
+#if MXNET_USE_CUDNN == 1
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNActivationOp<DType>(param);
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        op = new ActivationOp<gpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
+        break;
+      case activation::kSigmoid:
+        op = new ActivationOp<gpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
+        break;
+      case activation::kTanh:
+        op = new ActivationOp<gpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
+        break;
+      default:
+        LOG(FATAL) << "unknown activation";
+    }
+  })
+#endif  // MXNET_USE_CUDNN
+  return op;
 }
-#endif
-
-NNVM_REGISTER_OP(Activation)
-.set_attr<FCompute>("FCompute<gpu>", ActivationCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Activation)
-.set_attr<FCompute>("FCompute<gpu>", ActivationGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
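
The lines deleted from activation.cu above had cached one CuDNNActivationOp per thread and re-ran Init on it for every call, whereas the restored code constructs the operator through CreateOp<gpu>. A simplified sketch of that per-thread caching pattern (illustrative types only, not the real CuDNNActivationOp interface):

// Sketch of the per-thread cached-operator pattern used by the removed
// get_cudnn_op<DType>() helper: one static instance per thread, refreshed
// with the current parameters before each use.
struct FakeParam { int act_type = 0; };

struct FakeCudnnOp {
  void Init(const FakeParam& p) { act_type_ = p.act_type; }  // cheap re-init
  void Forward() { /* would launch the cuDNN kernel here */ }
  int act_type_ = 0;
};

template <typename DType>
FakeCudnnOp& get_cached_op(const FakeParam& param) {
  static thread_local FakeCudnnOp op;  // one instance per thread and DType
  op.Init(param);                      // descriptors refreshed per call
  return op;
}

void run(const FakeParam& param) {
  get_cached_op<float>(param).Forward();
}
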
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 27e0a8434d..2a9dee2cf8 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm-inl.h
  * \brief
- * \author Bing Xu, Chris Olivier, Da Zheng
+ * \author Bing Xu, Chris Olivier
  */
 #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
 #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
@@ -47,10 +47,8 @@ namespace mxnet {
 namespace op {
 
 namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
-  kInMovingVar};  // kGamma: weights, kBeta: biases
+enum BatchNormOpInputs {kData, kGamma, kBeta};  // kGamma: weights, kBeta: biases
 enum BatchNormOpOutputs {kOut, kMean, kVar};  // req, out_data
-enum BatchNormOpResource {kTempSpace};
 enum BatchNormOpAuxiliary {kMovingMean, kMovingVar};  // aux_states
 
 /*! \brief Default channel axis if none specified in the params */
@@ -85,203 +83,280 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
     DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
       .describe("Do not select CUDNN operator, if available");
   }
-
-  bool operator==(const BatchNormParam& other) const {
-    return this->eps == other.eps &&
-           this->momentum == other.momentum &&
-           this->fix_gamma == other.fix_gamma &&
-           this->use_global_stats == other.use_global_stats &&
-           this->output_mean_var == other.output_mean_var &&
-           this->axis == other.axis &&
-           this->cudnn_off == other.cudnn_off;
-  }
 };
 
-}  // namespace op
-}  // namespace mxnet
-
-namespace std {
-template<>
-struct hash<mxnet::op::BatchNormParam> {
-  size_t operator()(const mxnet::op::BatchNormParam& val) {
-    size_t ret = 0;
-    ret = dmlc::HashCombine(ret, val.momentum);
-    ret = dmlc::HashCombine(ret, val.fix_gamma);
-    ret = dmlc::HashCombine(ret, val.use_global_stats);
-    ret = dmlc::HashCombine(ret, val.output_mean_var);
-    ret = dmlc::HashCombine(ret, val.axis);
-    return ret;
+/*! \brief Batch normalization operator */
+template <typename xpu, typename DType, typename AccReal>
+class BatchNormOp : public Operator {
+ public:
+  explicit BatchNormOp(BatchNormParam param) {
+    this->param_ = param;
   }
-};
-}  // namespace std
 
-namespace mxnet {
-namespace op {
+  static inline bool IsWriting(const OpReqType ort) {
+    return ort == kWriteTo || ort == kWriteInplace;
+  }
 
-static inline bool IsBNWriting(const OpReqType ort) {
-  return ort == kWriteTo || ort == kWriteInplace;
-}
+  /*!
+   * \brief perform a forward operation of Operator, save the output to TBlob.
+   * \param ctx runtime context available to this call
+   * \param in_data array of input data, it is const
+   * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
+   * \param out_data array of output data; the pointer is used to indicate that this is a holder.
+   *        The space of the TBlobs in out_data must be pre-allocated with InferShape.
+   * \param aux_states Auxiliary states of the operator. Normally an operator doesn't
+   *        need them; special cases like Batch Norm do.
+   * \sa OpReqType, OpContext
+   */
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    CHECK_EQ(in_data.size(), 3U);
+    CHECK_EQ(aux_states.size(), 2U);
+    if (ctx.is_train) {
+      CHECK_EQ(out_data.size(), 3U);
+      CHECK_EQ(req.size(), 3U);
+    } else {
+      CHECK_GE(out_data.size(), 1U);
+      CHECK_GE(req.size(), 1U);
+      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
+    }
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    DoForward(s, ctx, in_data, req, out_data, aux_states);
+  }
 
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
-                          const OpContext &ctx, const BatchNormParam& param,
-                          const std::vector<TBlob> &in_data,
-                          const std::vector<OpReqType> &req,
-                          const std::vector<TBlob> &out_data,
-                          const std::vector<TBlob> &aux_states);
+  /*!
+   * \brief Perform a Backward Operation, write gradient to the in_grad.
+   *
+   * \note
+   * Convention:
+   *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
+   *   out_data.size() == OperatorProperty.NumOutputs()
+   * out_data can contain additional invisible returns that remember the
+   * state carried from the Forward pass, for example the mask in dropout.
+   * The gradients are passed from visible returns in this function.
+   *
+   * \par
+   * Not all the TBlobs in the arguments will be available
+   * if you override the DeclareBackwardDependency of the corresponding OperatorProperty class.
+   * Only the dependencies you declared will be available at the corresponding positions;
+   * the rest of the parameters are simply dummies for which you will get a nullptr.
+   * You will be safe if you use the default DeclareBackwardDependency.
+   * But declaring only what you need gives the engine more chances for optimization.
+   *
+   * \param ctx runtime context available to this call
+   * \param out_grad the gradient value we get from the output of the Operator.
+   * \param in_data the array of input data.
+   * \param out_data the array of output data.
+   * \param req request types of the saving operation, can be all types.
+   * \param in_grad the array of gradient we need to write to.
+   * \param aux_states Auxiliary states of operator. Normally operator doesn't need
+   * \sa OperatorProperty, OpReqType, OpContext
+   */
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U);
+    CHECK_EQ(in_data.size(), 3U);
+    CHECK_EQ(out_data.size(), 3U);
+    CHECK_EQ(in_grad.size(), 3U);
+    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+    DoBackward(s, ctx, out_grad, in_data,
+               out_data, req, in_grad, aux_states);
+  }
 
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
-                           const OpContext &ctx, const BatchNormParam& param,
-                           const std::vector<TBlob> &out_grad,
-                           const std::vector<TBlob> &in_data,
-                           const std::vector<TBlob> &out_data,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<TBlob> &in_grad,
-                           const std::vector<TBlob> &aux_states);
+ private:
+  void DoForward(mshadow::Stream<cpu> *stream,
+                 const OpContext &ctx,
+                 const std::vector<TBlob> &in_data,
+                 const std::vector<OpReqType> &req,
+                 const std::vector<TBlob> &out_data,
+                 const std::vector<TBlob> &aux_states);
+
+  void DoBackward(mshadow::Stream<cpu> *stream,
+                  const OpContext &ctx,
+                  const std::vector<TBlob> &out_grad,
+                  const std::vector<TBlob> &in_data,
+                  const std::vector<TBlob> &out_data,
+                  const std::vector<OpReqType> &req,
+                  const std::vector<TBlob> &in_grad,
+                  const std::vector<TBlob> &aux_states);
 
 #if MXNET_USE_CUDA
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
-                          const OpContext &ctx, const BatchNormParam& param,
-                          const std::vector<TBlob> &in_data,
-                          const std::vector<OpReqType> &req,
-                          const std::vector<TBlob> &out_data,
-                          const std::vector<TBlob> &aux_states);
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
-                           const OpContext &ctx, const BatchNormParam& param,
-                           const std::vector<TBlob> &out_grad,
-                           const std::vector<TBlob> &in_data,
-                           const std::vector<TBlob> &out_data,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<TBlob> &in_grad,
-                           const std::vector<TBlob> &aux_states);
+  void DoForward(mshadow::Stream<gpu> *stream,
+                 const OpContext &ctx,
+                 const std::vector<TBlob> &in_data,
+                 const std::vector<OpReqType> &req,
+                 const std::vector<TBlob> &out_data,
+                 const std::vector<TBlob> &aux_states);
+  void DoBackward(mshadow::Stream<gpu> *stream,
+                  const OpContext &ctx,
+                  const std::vector<TBlob> &out_grad,
+                  const std::vector<TBlob> &in_data,
+                  const std::vector<TBlob> &out_data,
+                  const std::vector<OpReqType> &req,
+                  const std::vector<TBlob> &in_grad,
+                  const std::vector<TBlob> &aux_states);
 #endif  // MXNET_USE_CUDA
 
-/*!
- * \brief perform a forward operation of Operator, save the output to TBlob.
- * \param ctx runtime context available to this call
- * \param in_data array of input data, it is const
- * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
- * \param out_data array of output data, pointer is used to indicate that this is holder
- *        the space of TBlob in out_data must be pre-allocated with InferShape
- * \param aux_states Auxiliary states of operator. Normally operator doesn't
- *        need, epecial case like Batch Norm requires.
- * \sa OpReqType, OpContext
- */
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
-                      const std::vector<TBlob> &in_data,
-                      const std::vector<OpReqType> &req,
-                      const std::vector<TBlob> &out_data,
-                      const std::vector<TBlob> &aux_states) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-
-  CHECK_EQ(in_data.size(), 3U);
-  CHECK_EQ(aux_states.size(), 2U);
-  if (ctx.is_train) {
-    CHECK_EQ(out_data.size(), 3U);
-    CHECK_EQ(req.size(), 3U);
-  } else {
-    CHECK_GE(out_data.size(), 1U);
-    CHECK_GE(req.size(), 1U);
-    CHECK_EQ(req[batchnorm::kOut], kWriteTo);
-  }
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
-                                            out_data, aux_states);
-}
-
-/*!
- * \brief Perform a Backward Operation, write gradient to the in_grad.
- *
- * \note
- * Convention:
- *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
- *   out_data.size() == OperatorProperty.NumOutputs()
- * out_data can contain additional invisible returns that remembers the
- * state carried from the Forward pass. For example mask in the dropout.
- * The gradients are passed from visible returns in this function.
- *
- * \par
- * Not all the TBlobs in the arguments will be available
- * if you override the DeclareBackwardDependency of corresponding OperatorProperty class.
- * Only the dependencies you declared will be available at corresponding position,
- * the rest of the parameters are simply dummy where you will get a nullptr.
- * You will be safe if you use the default DeclareBackwardDependency.
- * But only declare what you need will give engine more chance for optimization.
- *
- * \param ctx runtime context available to this call
- * \param out_grad the gradient value we get from of the Operator.
- * \param in_data the array of input data.
- * \param out_data the array of output data.
- * \param req request types of the saving operation, can be all types.
- * \param in_grad the array of gradient we need to write to.
- * \param aux_states Auxiliary states of operator. Normally operator doesn't need
- * \sa OperatorProperty, OpReqType, OpContext
- */
-template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
-                       const std::vector<TBlob> &out_grad,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &in_grad,
-                       const std::vector<TBlob> &aux_states) {
-  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
-  CHECK_EQ(in_data.size(), 3U);
-  CHECK_EQ(out_data.size(), 3U);
-  CHECK_EQ(in_grad.size(), 3U);
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
-                                             out_data, req, in_grad, aux_states);
-}
+  /*! \brief Batch normalization operator parameters */
+  BatchNormParam param_;
+};  // class BatchNormOp
 
 template<typename xpu>
-void BatchNormCompute(const nnvm::NodeAttrs& attrs,
-                      const OpContext& ctx, const std::vector<TBlob>& inputs,
-                      const std::vector<OpReqType>& req,
-                      const std::vector<TBlob>& outputs) {
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 5U);
-  std::vector<TBlob> in_data(inputs.begin(),
-                             inputs.begin() + batchnorm::kInMovingMean);
-  std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
-                                inputs.end());
-  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
-    BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
-                                          aux_states);
-  });
-}
-
-template<typename xpu>
-void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx, const std::vector<TBlob>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 11U);
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  int num_out_grads = param.output_mean_var ? 3U : 1U;
-  int in_data_start = 3;
-  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
-  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
-  std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
-  std::vector<TBlob> in_data(inputs.begin() + in_data_start,
-                             inputs.begin() + aux_states_start);
-  std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
-                                inputs.begin() + out_data_start);
-  std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
-
-  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
-    BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
-                                           in_grad, aux_states);
-  });
-}
+Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape);
 
 #if DMLC_USE_CXX11
+class BatchNormProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
+    const TShape &dshape = in_shape->at(0);
+
+    const size_t channelAxis = static_cast<size_t>(param_.axis < 0
+                            ? static_cast<int>(dshape.ndim()) + param_.axis
+                            : param_.axis);
+    CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis;
+
+    const int channelCount = dshape[channelAxis];
+
+    if (dshape.ndim() == 0) {
+      return false;
+    }
+
+    in_shape->at(1) = TShape(Shape1(channelCount));
+    in_shape->at(2) = TShape(Shape1(channelCount));
+
+    out_shape->clear();
+    out_shape->push_back(dshape);                // kOut
+    out_shape->push_back(Shape1(channelCount));  // kMean
+    out_shape->push_back(Shape1(channelCount));  // kVar
+
+    aux_shape->clear();
+    aux_shape->push_back(Shape1(channelCount));  // kMovingMean
+    aux_shape->push_back(Shape1(channelCount));  // kMovingVar
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    using namespace mshadow;
+    CHECK_GE(in_type->size(), 1U);
+    const int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    // For float16 input type beta, gamma, mean, and average are stored in float32.
+    // For other input types, these parameters have the same type as input
+    // NOTE: This requirement is from cuDNN (v. 4 and 5)
+    int dtype_param;
+    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
+         dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+    for (index_t i = 1; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype_param;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]);
+      }
+    }
+    for (index_t i = 0; i < aux_type->size(); ++i) {
+      if ((*aux_type)[i] != -1) {
+        UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]);
+      }
+    }
+    const size_t n_aux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (size_t i = 0; i < n_aux; ++i) {
+      aux_type->push_back(dtype_param);
+    }
+    const size_t n_out = this->ListOutputs().size();
+    out_type->clear();
+    out_type->push_back(dtype);
+    for (size_t i = 1; i < n_out; ++i) {
+      out_type->push_back(dtype_param);
+    }
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new BatchNormProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "BatchNorm";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[batchnorm::kOut],
+            out_data[batchnorm::kMean],
+            out_data[batchnorm::kVar],
+            in_data[batchnorm::kData],
+            in_data[batchnorm::kGamma]
+           };
+  }
+
+  int NumVisibleOutputs() const override {
+    if (param_.output_mean_var) {
+      return 3;
+    }
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 3;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "gamma", "beta"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "mean", "var"};
+  }
+
+  std::vector<std::string> ListAuxiliaryStates() const override {
+    return {"moving_mean", "moving_var"};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+      LOG(FATAL) << "Not Implemented.";
+      return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+      std::vector<int> *in_type) const override;
+
+  inline const BatchNormParam& getParam() const {
+    return param_;
+  }
+
+ private:
+  BatchNormParam param_;
+};  // class BatchNormProp
 
 namespace batchnorm {
 
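
For reference, the DoForward path restored in batch_norm-inl.h normalizes each channel by the batch mean and variance and then applies gamma and beta. A compact standalone sketch for NCHW float data (illustrative only; moving-average updates, use_global_stats, and fix_gamma handling are omitted):

#include <cmath>
#include <vector>

// Sketch: training-mode batch norm forward for NCHW data.
// y = gamma_c * (x - mean_c) / sqrt(var_c + eps) + beta_c, per channel c.
void batch_norm_forward(const std::vector<float>& x, std::vector<float>* y,
                        const std::vector<float>& gamma, const std::vector<float>& beta,
                        int N, int C, int H, int W, float eps = 1e-3f) {
  const int spatial = H * W;
  const float m = static_cast<float>(N * spatial);  // elements per channel
  for (int c = 0; c < C; ++c) {
    float mean = 0.f, var = 0.f;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < spatial; ++s)
        mean += x[(n * C + c) * spatial + s];
    mean /= m;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < spatial; ++s) {
        const float d = x[(n * C + c) * spatial + s] - mean;
        var += d * d;
      }
    var /= m;
    const float invstd = 1.f / std::sqrt(var + eps);
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < spatial; ++s) {
        const int i = (n * C + c) * spatial + s;
        (*y)[i] = gamma[c] * (x[i] - mean) * invstd + beta[c];
      }
  }
}
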
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index ba6c413819..ca28832394 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -21,15 +21,16 @@
  * Copyright (c) 2015 by Contributors
  * \file batch_norm.cc
  * \brief
- * \author Bing Xu, Chris Olivier, Da Zheng
+ * \author Bing Xu, Chris Olivier
 */
 
 #include "batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
-#include "../elemwise_op_common.h"
-#if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_batch_norm-inl.h"
-#endif
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "../mkl/mkl_memory-inl.h"
+#include "../mkl/mkl_batch_norm-inl.h"
+#endif  // MXNET_USE_MKL2017
 
 /*! \brief inverse standard deviation <-> variance */
 #define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
@@ -88,12 +89,12 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
 
 /*! \brief Forward CPU */
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *,
-                          const OpContext &ctx, const BatchNormParam& param_,
-                          const std::vector<TBlob> &in_data,
-                          const std::vector<OpReqType> &req,
-                          const std::vector<TBlob> &out_data,
-                          const std::vector<TBlob> &aux_states) {
+void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
+                                                 const OpContext &ctx,
+                                                 const std::vector<TBlob> &in_data,
+                                                 const std::vector<OpReqType> &req,
+                                                 const std::vector<TBlob> &out_data,
+                                                 const std::vector<TBlob> &aux_states) {
   // Input
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights         = in_data[batchnorm::kGamma];
@@ -163,7 +164,7 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
 
     // note that var is still invstd
     if (!param_.fix_gamma) {
-      if (IsBNWriting(req[batchnorm::kData])) {
+      if (IsWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -172,10 +173,10 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
                     });
       }
     } else {
-      if (IsBNWriting(req[batchnorm::kGamma])) {
+      if (IsWriting(req[batchnorm::kGamma])) {
         w[channel] = AccReal(1);
       }
-      if (IsBNWriting(req[batchnorm::kData])) {
+      if (IsWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -188,14 +189,14 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
 }
 
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
-                           const OpContext &ctx, const BatchNormParam& param_,
-                           const std::vector<TBlob> &out_grad,
-                           const std::vector<TBlob> &in_data,
-                           const std::vector<TBlob> &out_data,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<TBlob> &in_grad,
-                           const std::vector<TBlob> &aux_states) {
+void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
+                                                  const OpContext &ctx,
+                                                  const std::vector<TBlob> &out_grad,
+                                                  const std::vector<TBlob> &in_data,
+                                                  const std::vector<TBlob> &out_data,
+                                                  const std::vector<OpReqType> &req,
+                                                  const std::vector<TBlob> &in_grad,
+                                                  const std::vector<TBlob> &aux_states) {
   // Input Data
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights   = in_data[batchnorm::kGamma];
@@ -263,7 +264,7 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
                   dotp += (*thisInputData - mean) * (*gradOut_data);
                 });
 
-    if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) {  // if there's a grad input
+    if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) {  // if there's a grad input
       if (is_train_and_not_global_stats) {
         // when in training mode
         // Q(X) = X - E[x] ; i.e. input centered to zero mean
@@ -299,7 +300,7 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
     // May want to make this a param eventually
     const AccReal scale = 1.0f;
 
-    if (IsBNWriting(req[batchnorm::kGamma])) {
+    if (IsWriting(req[batchnorm::kGamma])) {
       if (!param_.fix_gamma) {
         gradWeightData[channel] = scale * dotp * invstd;
       } else {
@@ -307,185 +308,51 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
       }
     }
 
-    if (IsBNWriting(req[batchnorm::kBeta])) {
+    if (IsWriting(req[batchnorm::kBeta])) {
       gradBiasData[channel] = scale * sumGradOut;
     }
   }
 }
 
-DMLC_REGISTER_PARAMETER(BatchNormParam);
-
-static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
-                           std::vector<TShape> *in_shape,
-                           std::vector<TShape> *out_shape) {
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  using namespace mshadow;
-  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
-  const TShape &dshape = in_shape->at(batchnorm::kData);
-
-  const size_t channelAxis = static_cast<size_t>(param.axis < 0
-      ? static_cast<int>(dshape.ndim()) + param.axis
-      : param.axis);
-  CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
-
-  const int channelCount = dshape[channelAxis];
-
-  if (dshape.ndim() == 0) {
-    return false;
-  }
-
-  in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount));
-  in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount));
-  in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount));  // kMovingMean
-  in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount));  // kMovingVar
-
-  out_shape->clear();
-  out_shape->push_back(dshape);                // kOut
-  out_shape->push_back(Shape1(channelCount));  // kMean
-  out_shape->push_back(Shape1(channelCount));  // kVar
-
-  return true;
-}
-
-static bool BatchNormType(const nnvm::NodeAttrs& attrs,
-                          std::vector<int> *in_type, std::vector<int> *out_type) {
-  using namespace mshadow;
-  CHECK_GE(in_type->size(), 1U);
-  const int dtype = (*in_type)[0];
-  CHECK_NE(dtype, -1) << "First input must have specified type";
-  // For float16 input type beta, gamma, mean, and average are stored in float32.
-  // For other input types, these parameters have the same type as input
-  // NOTE: This requirement is from cuDNN (v. 4 and 5)
-  int dtype_param;
-  MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
-      dtype_param = mshadow::DataType<AccRealX>::kFlag; });
-  std::vector<std::string> args{"data", "gamma", "beta", "mean", "var"};
-  CHECK_LE(in_type->size(), args.size());
-  for (index_t i = 1; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype_param;
-    } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]);
-    }
-  }
-  const size_t n_out = 3;
-  out_type->clear();
-  out_type->push_back(dtype);
-  for (size_t i = 1; i < n_out; ++i) {
-    out_type->push_back(dtype_param);
-  }
-  return true;
-}
-
-#if MXNET_USE_MKLDNN == 1
-static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &param) {
-  TShape shape = input.shape();
-  return SupportMKLDNN(input) && shape.ndim() == 4
+template<>
+Operator *CreateOp<cpu>(BatchNormParam param, const int dtype, const TShape& shape) {
+  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
+  Operator *op = nullptr;
+#if MXNET_USE_MKL2017 == 1
+  if (shape.ndim() == 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS
-      && shape[param.axis] % 8 == 0;
-}
-
-void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
-                           const OpContext &ctx,
-                           const std::vector<NDArray> &inputs,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<NDArray> &outputs) {
-  CHECK_EQ(inputs.size(), 5U);
-  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
-  // MKLDNN batchnorm only works well on the special MKLDNN layout.
-  if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNNData()) {
-    std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
-    std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
-
-    if (inputs[0].dtype() == mshadow::kFloat32) {
-      MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-      MKLDNNBatchNormForward<float>(ctx, param, in_data, req, outputs, aux_states);
-      MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
-      return;
+      && !mxnet::op::batchnorm::disable_mkl) {
+    switch (dtype) {
+      case mshadow::kFloat32:
+        op = new MKLBatchNormOp<cpu, float>(param);
+        break;
+      case mshadow::kFloat64:
+        op = new MKLBatchNormOp<cpu, double>(param);
+        break;
+      default:
+        // MKL operator doesn't support half_t, so fall through
+        break;
     }
   }
-  FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-
-void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
-                               const OpContext &ctx,
-                               const std::vector<NDArray> &inputs,
-                               const std::vector<OpReqType> &req,
-                               const std::vector<NDArray> &outputs) {
-  CHECK_EQ(inputs.size(), 11U);
-  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
-  int num_out_grads = param.output_mean_var ? 3U : 1U;
-  int in_data_start = 3;
-  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
-  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
-
-  TShape shape = inputs[0].shape();
-  // MKLDNN batchnorm only works well on the special MKLDNN layout.
-  if (SupportMKLDNNBN(inputs[0], param)
-      && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
-    std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
-    std::vector<NDArray> in_data(inputs.begin() + in_data_start,
-                                 inputs.begin() + aux_states_start);
-    std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
-                                    inputs.begin() + out_data_start);
-    std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
-    std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
-
-    if (inputs[0].dtype() == mshadow::kFloat32) {
-      MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-      MKLDNNBatchNormBackward<float>(ctx, param, out_grad, in_data,
-                                     out_data, req, in_grad, aux_states);
-      MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-      return;
-    }
-  }
-  FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
 #endif
-
-static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
-                                        const int dev_mask,
-                                        DispatchMode *dispatch_mode,
-                                        std::vector<int> *in_attrs,
-                                        std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 5);
-  CHECK_EQ(out_attrs->size(), 3);
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  for (int& v : *in_attrs) {
-    if (v == - 1) v = kDefaultStorage;
+  if (!op) {
+    MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
+                                DType,
+                                AccReal, {
+                                  op = new BatchNormOp<cpu, DType, AccReal>(param); });
   }
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return op;
 }
 
-static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
-                                                 const int dev_mask,
-                                                 DispatchMode *dispatch_mode,
-                                                 std::vector<int> *in_attrs,
-                                                 std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 11);
-  CHECK_EQ(out_attrs->size(), 5);
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  for (int& v : *in_attrs) {
-    if (v == - 1) v = kDefaultStorage;
-  }
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                          std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
 }
 
-NNVM_REGISTER_OP(BatchNorm)
+DMLC_REGISTER_PARAMETER(BatchNormParam);
+
+MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp)
 .describe(R"code(Batch normalization.
 
 Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
@@ -531,44 +398,14 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr
 then set ``gamma`` to 1 and its gradient to 0.
 
 )code" ADD_FILELINE)
-.set_num_inputs(5)
-.set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"output", "mean", "var"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
-    [](const NodeAttrs& attrs) {
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  return param.output_mean_var ? 3 : 1;
-})
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
-  return std::vector<uint32_t>{3, 4};
-})
-.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
-.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
 .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
 .add_argument("beta", "NDArray-or-Symbol", "beta array")
 .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
 .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__())
+.add_arguments(BatchNormParam::__FIELDS__());
+
+NNVM_REGISTER_OP(BatchNorm)
 .set_attr<nnvm::FSetInputVarAttrOnCompose>(
   "FSetInputVarAttrOnCompose",
   [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
@@ -580,20 +417,5 @@ then set ``gamma`` to 1 and its gradient to 0.
     }
   });
 
-NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_outputs(5)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr_parser(ParamParser<BatchNormParam>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
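
The CPU DoBackward above reduces each channel to sumGradOut and dotp and derives the gradients from them: dbeta = sum(dy), dgamma = sum(dy * x_hat) = dotp * invstd, and in training mode dx = gamma * invstd * (dy - dbeta/m - x_hat * dgamma/m), where m is the number of elements per channel. A single-channel sketch of those reductions (standalone C++, illustrative names):

#include <vector>

// Sketch: training-mode batch norm backward for one channel,
// mirroring the per-channel reductions (sumGradOut, dotp) in DoBackward above.
void bn_backward_one_channel(const std::vector<float>& x, const std::vector<float>& dy,
                             float mean, float invstd, float gamma,
                             std::vector<float>* dx, float* dgamma, float* dbeta) {
  const float m = static_cast<float>(x.size());
  float sum_dy = 0.f, dotp = 0.f;
  for (size_t i = 0; i < x.size(); ++i) {
    sum_dy += dy[i];                     // -> dbeta
    dotp += (x[i] - mean) * dy[i];       // sum(dy * (x - mean))
  }
  *dbeta = sum_dy;
  *dgamma = dotp * invstd;               // sum(dy * x_hat)
  for (size_t i = 0; i < x.size(); ++i) {
    const float x_hat = (x[i] - mean) * invstd;
    (*dx)[i] = gamma * invstd * (dy[i] - sum_dy / m - x_hat * (*dgamma) / m);
  }
}
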
diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu
index 80c15976b6..59317b7fa8 100644
--- a/src/operator/nn/batch_norm.cu
+++ b/src/operator/nn/batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm.cu
  * \brief CUDA Batch Normalization code
- * \author Chris Olivier, Bing Xu, Da Zheng
+ * \author Chris Olivier, Bing Xu
  * Adapted from Torch
 */
 #include <cuda_runtime_api.h>
@@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
   flags |= ctx.is_train ? IS_TRAINING_FLAG : 0;
   flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0;
   flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0;
-  if (IsBNWriting(req[batchnorm::kData])) {
+  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kData])) {
     flags |= WRITE_DATA_FLAG;
   }
-  if (IsBNWriting(req[batchnorm::kGamma])) {
+  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kGamma])) {
     flags |= WRITE_GAMMA_FLAG;
   }
-  if (IsBNWriting(req[batchnorm::kBeta])) {
+  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kBeta])) {
     flags |= WRITE_BETA_FLAG;
   }
   return flags;
@@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
 
 /*! \brief Forward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
-                          const OpContext &ctx, const BatchNormParam& param_,
-                          const std::vector<TBlob> &in_data,
-                          const std::vector<OpReqType> &req,
-                          const std::vector<TBlob> &out_data,
-                          const std::vector<TBlob> &aux_states) {
+void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<gpu> *stream,
+                                                 const OpContext &ctx,
+                                                 const std::vector<TBlob> &in_data,
+                                                 const std::vector<OpReqType> &req,
+                                                 const std::vector<TBlob> &out_data,
+                                                 const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationUpdateOutput<DType, AccReal>(
     stream,
     ctx,
@@ -614,14 +614,14 @@ void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
 
 /*! \brief Backward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
-                           const OpContext &ctx, const BatchNormParam& param_,
-                           const std::vector<TBlob> &out_grad,
-                           const std::vector<TBlob> &in_data,
-                           const std::vector<TBlob> &out_data,
-                           const std::vector<OpReqType> &req,
-                           const std::vector<TBlob> &in_grad,
-                           const std::vector<TBlob> &aux_states) {
+void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<gpu> *stream,
+                                                  const OpContext &ctx,
+                                                  const std::vector<TBlob> &out_grad,
+                                                  const std::vector<TBlob> &in_data,
+                                                  const std::vector<TBlob> &out_data,
+                                                  const std::vector<OpReqType> &req,
+                                                  const std::vector<TBlob> &in_grad,
+                                                  const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationBackward<DType, AccReal>(
     stream,
     ctx,
@@ -637,92 +637,30 @@ void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
   MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu);
 }
 
-#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
-template<typename DType>
-static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNBatchNormOp<DType> op;
-#else
-  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
-#endif
-  op.Init(param);
-  return op;
-}
-#endif
-
-template<>
-void BatchNormCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 5U);
-  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
-  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
-  int dtype = inputs[0].type_flag_;
-  TShape shape = inputs[0].shape_;
-
-  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
-#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
-  if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
-      && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      GetCuDNNOp<DType>(param).Forward(ctx, in_data, req, outputs, aux_states);
-    })
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
-      BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
-    })
-  }
-#else
-  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
-    BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
-  });
-#endif
-}
-
+/*! \brief Create GPU operator for batch normalization */
 template<>
-void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                               const OpContext& ctx, const std::vector<TBlob>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 11U);
-  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
-  std::vector<TBlob> out_grad(1, inputs[0]);
-  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
-  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
-  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
-  int dtype = inputs[0].type_flag_;
-  TShape shape = inputs[0].shape_;
-
+Operator *CreateOp<gpu>(BatchNormParam param, const int dtype, const TShape& shape) {
   param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
+  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
   if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      GetCuDNNOp<DType>(param).Backward(ctx, out_grad, in_data, out_data,
-        req, in_grad, aux_states);
+      op = new CuDNNBatchNormOp<DType>(param);
     })
   } else {
     MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
-      BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
-          in_data, out_data, req, in_grad, aux_states);
+      op = new BatchNormOp<gpu, DType, AccReal>(param);
     })
   }
 #else
-  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
-    BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
-        in_data, out_data, req, in_grad, aux_states);
-  });
+  MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
+                              DType,
+                              AccReal,
+                              { op = new BatchNormOp<gpu, DType, AccReal>(param); });
 #endif
+  return op;
 }
 
-NNVM_REGISTER_OP(BatchNorm)
-.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
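
The GetCuDNNOp helper removed above keeps a single CuDNNBatchNormOp instance per thread and re-initializes it with the current parameters on every call, whereas the restored CreateOp<gpu> path constructs a fresh Operator each time. A minimal sketch of that thread-local caching pattern, using hypothetical names rather than the MXNet types:

    #include <iostream>

    // Illustrative stand-in for a stateful operator that is cheap to re-Init()
    // but expensive to construct (e.g. because it owns cuDNN descriptors).
    template <typename DType>
    struct CachedOp {
      void Init(double eps) { eps_ = eps; }  // re-parameterize the cached instance
      double eps_ = 0.0;
    };

    // One instance per thread, constructed lazily on first use and then reused.
    template <typename DType>
    CachedOp<DType>& GetThreadLocalOp(double eps) {
      static thread_local CachedOp<DType> op;
      op.Init(eps);
      return op;
    }

    int main() {
      CachedOp<float>& a = GetThreadLocalOp<float>(1e-5);
      CachedOp<float>& b = GetThreadLocalOp<float>(1e-3);
      std::cout << (&a == &b) << "\n";  // prints 1: the same per-thread object is reused
      return 0;
    }
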
diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h
deleted file mode 100644
index a7f1fa85f6..0000000000
--- a/src/operator/nn/concat-inl.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_
-#define MXNET_OPERATOR_NN_CONCAT_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../channel_op_common.h"
-#include "../tensor/broadcast_reduce_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace concat_enum {
-enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
-enum ConcatOpResource {kTempSpace};
-enum ConcatOpOutputs {kOut};
-}  // namespace concat_enum
-
-struct ConcatParam : public dmlc::Parameter<ConcatParam> {
-  int num_args;
-  int dim;
-  DMLC_DECLARE_PARAMETER(ConcatParam) {
-    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
-    .describe("Number of inputs to be concated.");
-    DMLC_DECLARE_FIELD(dim).set_default(1)
-    .describe("the dimension to be concated.");
-  }
-};  // struct ConcatParam
-
-template<typename xpu, typename DType>
-class ConcatOp {
- public:
-  void Init(const ConcatParam &param) {
-    this->size_ = param.num_args;
-    this->dimension_ = param.dim;
-  }
-
-  void Forward(const OpContext &ctx,
-               const std::vector<TBlob> &in_data,
-               const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(static_cast<int>(in_data.size()), size_);
-    CHECK_EQ(out_data.size(), 1U);
-    int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > data(size_);
-    Tensor<xpu, 3, DType> out;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_data[concat_enum::kOut].shape_[i];
-    }
-    for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
-      trailing *= out_data[concat_enum::kOut].shape_[i];
-    }
-    size_t mid = out_data[concat_enum::kOut].shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing);
-      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Concatenate(data, &out, 1, req[concat_enum::kOut]);
-  }
-
-  void Backward(const OpContext &ctx, const TBlob &out_grad,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &in_grad) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
-    int axis = CheckAxis(dimension_, out_grad.ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
-    Tensor<xpu, 3, DType> grad;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_grad.shape_[i];
-    }
-    for (int i = axis + 1; i < out_grad.ndim(); ++i) {
-      trailing *= out_grad.shape_[i];
-    }
-    size_t mid = out_grad.shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    grad = out_grad.get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing);
-      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Split(grad, &grad_in, 1, req);
-  }
-
- private:
-  int size_;
-  int dimension_;
-};  // class ConcatOp
-
-template<typename xpu>
-void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
-                   const std::vector<TBlob>& inputs,
-                   const std::vector<OpReqType>& req,
-                   const std::vector<TBlob>& outputs) {
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, {
-    ConcatOp<xpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  });
-}
-
-template<typename xpu>
-void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
-                       const std::vector<TBlob>& inputs,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& outputs) {
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, {
-    ConcatOp<xpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, inputs[concat_enum::kOut], req, outputs);
-  });
-}
-
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_NN_CONCAT_INL_H_
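
The ConcatOp deleted above handles any input rank and axis by collapsing each tensor to a 3-D view of (leading, axis, trailing) and concatenating along the middle dimension. A small standalone sketch of that index arithmetic, using plain C++ containers instead of the mshadow tensors shown above:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Collapse `shape` around `axis` into (leading, mid, trailing), the same
    // reshaping the deleted ConcatOp::Forward performs before concatenating.
    void CollapseAroundAxis(const std::vector<std::size_t>& shape, int axis,
                            std::size_t* leading, std::size_t* mid,
                            std::size_t* trailing) {
      *leading = 1;
      *trailing = 1;
      for (int i = 0; i < axis; ++i) *leading *= shape[i];
      *mid = shape[axis];
      for (std::size_t i = axis + 1; i < shape.size(); ++i) *trailing *= shape[i];
    }

    int main() {
      std::size_t leading, mid, trailing;
      CollapseAroundAxis({2, 3, 4, 5}, 1, &leading, &mid, &trailing);
      // A (2, 3, 4, 5) tensor concatenated on axis 1 is viewed as (2, 3, 20).
      std::cout << leading << " " << mid << " " << trailing << "\n";
      return 0;
    }
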
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
deleted file mode 100644
index 81dc95f1a5..0000000000
--- a/src/operator/nn/concat.cc
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./concat-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
-#include "../../common/utils.h"
-
-namespace mxnet {
-namespace op {
-
-static bool ConcatShape(const nnvm::NodeAttrs& attrs,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape) {
-  using namespace mshadow;
-  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
-  CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-  TShape dshape;
-  index_t size = 0;
-  bool has_zero = false;
-  int axis = -1;
-  for (int i = 0; i < param_.num_args; ++i) {
-    TShape tmp = (*in_shape)[i];
-    if (tmp.ndim()) {
-      axis = CheckAxis(param_.dim, tmp.ndim());
-      has_zero = tmp[axis] == 0 || has_zero;
-      size += tmp[axis];
-      tmp[axis] = 0;
-      shape_assign(&dshape, tmp);
-    }
-  }
-
-  TShape tmp = (*out_shape)[0];
-  if (tmp.ndim()) {
-    axis = CheckAxis(param_.dim, tmp.ndim());
-    tmp[axis] = 0;
-    shape_assign(&dshape, tmp);
-  }
-
-  if (dshape.ndim() == 0) return false;
-
-  for (int i = 0; i < param_.num_args; ++i) {
-    CHECK(shape_assign(&(*in_shape)[i], dshape))
-        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
-  }
-
-  if (!has_zero) dshape[axis] = size;
-  CHECK(shape_assign(&(*out_shape)[0], dshape))
-      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
-
-  return dshape.Size() != 0;
-}
-
-static bool ConcatType(const nnvm::NodeAttrs& attrs,
-                       std::vector<int> *in_type,
-                       std::vector<int> *out_type) {
-  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
-  int dtype = -1;
-
-  for (size_t i = 0; i < in_type->size(); ++i) {
-    if (dtype == -1) {
-      dtype = in_type->at(i);
-    } else {
-      CHECK(in_type->at(i) == dtype ||
-            in_type->at(i) == -1) <<
-          "Non-uniform data type in Concat";
-    }
-  }
-
-  if (dtype == -1) {
-    LOG(FATAL) << "Not enough information to infer type in Concat.";
-    return false;
-  }
-
-  size_t nin = param_.num_args;
-  in_type->clear();
-  for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
-
-  out_type->clear();
-  out_type->push_back(dtype);
-
-  return true;
-}
-
-inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs,
-                                                 const int dev_mask,
-                                                 DispatchMode* dispatch_mode,
-                                                 std::vector<int> *in_attrs,
-                                                 std::vector<int> *out_attrs) {
-  CHECK(!in_attrs->empty());
-  CHECK_EQ(out_attrs->size(), 1U);
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
-      && param.dim > 0)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
-}
-
-inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs,
-                                             const int dev_mask,
-                                             DispatchMode* dispatch_mode,
-                                             std::vector<int> *in_attrs,
-                                             std::vector<int> *out_attrs) {
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  CHECK_EQ(out_attrs->size(), in_attrs->size() - 1);
-  if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
-      && param.dim > 0)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
-}
-
-#if MXNET_USE_MKLDNN == 1
-static void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs,
-                               const OpContext& op_ctx,
-                               const std::vector<NDArray>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<NDArray>& outputs) {
-  CHECK(!inputs.empty());
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  if (req[0] == kNullOp) return;
-  // MKLDNN support 2D and 4D concat
-  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
-      && inputs[0].dtype() == mshadow::kFloat32) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
-}
-
-static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                   const OpContext& ctx,
-                                   const std::vector<NDArray>& inputs,
-                                   const std::vector<OpReqType>& req,
-                                   const std::vector<NDArray>& outputs) {
-  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
-      && inputs[0].dtype() == mshadow::kFloat32) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-#endif
-
-struct ConcatGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    CHECK_EQ(ograds.size(), 1);
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-#if MXNET_USE_MKLDNN == 1
-    for (size_t i = 0; i < n->inputs.size(); i++) {
-      heads.push_back(n->inputs[i]);
-    }
-#endif
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
-
-DMLC_REGISTER_PARAMETER(ConcatParam);
-
-NNVM_REGISTER_OP(Concat)
-.describe(R"code(Joins input arrays along a given axis.
-
-.. note:: `Concat` is deprecated. Use `concat` instead.
-
-The dimensions of the input arrays should be the same except the axis along
-which they will be concatenated.
-The dimension of the output array along the concatenated axis will be equal
-to the sum of the corresponding dimensions of the input arrays.
-
-Example::
-
-   x = [[1,1],[2,2]]
-   y = [[3,3],[4,4],[5,5]]
-   z = [[6,6], [7,7],[8,8]]
-
-   concat(x,y,z,dim=0) = [[ 1.,  1.],
-                          [ 2.,  2.],
-                          [ 3.,  3.],
-                          [ 4.,  4.],
-                          [ 5.,  5.],
-                          [ 6.,  6.],
-                          [ 7.,  7.],
-                          [ 8.,  8.]]
-
-   Note that you cannot concat x,y,z along dimension 1 since dimension
-   0 is not the same for all the input arrays.
-
-   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
-                         [ 4.,  4.,  7.,  7.],
-                         [ 5.,  5.,  8.,  8.]]
-
-)code" ADD_FILELINE)
-.set_num_inputs([](const NodeAttrs& attrs) {
-  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
-  return params.num_args;
-})
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<ConcatParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
-  std::vector<std::string> ret;
-  for (int i = 0; i < params.num_args; ++i) {
-    ret.push_back(std::string("arg") + std::to_string(i));
-  }
-  return ret;
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr<nnvm::FInferShape>("FInferShape", ConcatShape)
-.set_attr<nnvm::FInferType>("FInferType", ConcatType)
-.set_attr<FInferStorageType>("FInferStorageType", ConcatForwardInferStorageType)
-.set_attr<FCompute>("FCompute<cpu>", ConcatCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ConcatGrad{"_backward_Concat"})
-.set_attr<std::string>("key_var_num_args", "num_args")
-.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
-.add_arguments(ConcatParam::__FIELDS__());
-
-NNVM_REGISTER_OP(Concat).add_alias("concat");
-
-NNVM_REGISTER_OP(_backward_Concat)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
-  return params.num_args;
-})
-.set_attr_parser(ParamParser<ConcatParam>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BackwardConcatStorageType)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", ConcatGradCompute<cpu>);
-
-}  // namespace op
-}  // namespace mxnet
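
The ConcatForwardInferStorageType and ConcatComputeExCPU functions removed above decide, between them, whether the MKL-DNN kernel runs: the operator must be on CPU with dense inputs, and the arrays must be 2-D or 4-D float32; otherwise execution falls back to the generic FCompute kernel. A sketch of that combined decision in isolation, with hypothetical names standing in for the MXNet dispatch machinery:

    #include <iostream>

    enum class Dispatch { kFCompute, kFComputeEx };

    struct InputDesc {
      bool on_cpu;         // device the operator is bound to
      bool dense_storage;  // kDefaultStorage in MXNet terms
      bool is_float32;
      int ndim;
    };

    // Mirrors the CPU-only, dense, float32, 2-D/4-D conditions in the removed code.
    Dispatch ChooseConcatDispatch(const InputDesc& in, bool built_with_mkldnn) {
      const bool supported_rank = (in.ndim == 2 || in.ndim == 4);
      if (built_with_mkldnn && in.on_cpu && in.dense_storage &&
          in.is_float32 && supported_rank) {
        return Dispatch::kFComputeEx;  // fast MKL-DNN path
      }
      return Dispatch::kFCompute;      // generic fallback kernel
    }

    int main() {
      InputDesc in{true, true, true, 4};
      std::cout << (ChooseConcatDispatch(in, true) == Dispatch::kFComputeEx) << "\n";  // 1
      return 0;
    }
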
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 6204f75c46..1613da6c85 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -22,7 +22,7 @@
  * \file convolution-inl.h
  * \brief
  * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
 #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
@@ -148,9 +148,9 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class ConvolutionOp {
+class ConvolutionOp : public Operator {
  public:
-  void Init(ConvolutionParam p) {
+  explicit ConvolutionOp(ConvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
@@ -160,10 +160,11 @@ class ConvolutionOp {
       << "Only support NCW, NCHW and NCDHW layout";
   }
 
-  void Forward(const OpContext &ctx,
-               const std::vector<TBlob> &in_data,
-               const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(req[conv::kOut], kWriteTo);
@@ -232,19 +233,18 @@ class ConvolutionOp {
     }
   }
 
-  void Backward(const OpContext &ctx,
-                const std::vector<TBlob>& out_grad,
-                const std::vector<TBlob>& in_data,
-                const std::vector<OpReqType>& req,
-                const std::vector<TBlob>& in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob>& out_grad,
+                        const std::vector<TBlob>& in_data,
+                        const std::vector<TBlob>& out_data,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& in_grad,
+                        const std::vector<TBlob>& aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1U);
-    // We expect 2 inputs: in data and weight. We don't need bias for
-    // computing gradient.
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(in_grad.size(), expected);
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
     LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_);
@@ -386,35 +386,299 @@ class ConvolutionOp {
 };  // class ConvolutionOp
 
 template<typename xpu>
-void ConvolutionCompute(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx, const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
-    ConvolutionOp<xpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  });
-}
+Operator* CreateOp(ConvolutionParam param, int dtype,
+                   std::vector<TShape> *in_shape,
+                   std::vector<TShape> *out_shape,
+                   Context ctx);
 
-template<typename xpu>
-void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs,
-                            const OpContext& ctx, const std::vector<TBlob>& inputs,
-                            const std::vector<OpReqType>& req,
-                            const std::vector<TBlob>& outputs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  const TBlob &out_grad = inputs[0];
-  const std::vector<TBlob> &in_grad = outputs;
-
-  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
-    ConvolutionOp<xpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-  });
-}
+#if DMLC_USE_CXX11
+class ConvolutionProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    if (!param_.no_bias) {
+      return {"data", "weight", "bias"};
+    } else {
+      return {"data", "weight"};
+    }
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 1) {
+      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    } else if (param_.kernel.ndim() == 2) {
+      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    } else {
+      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
+      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+    }
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    if (!param_.no_bias) {
+      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+    } else {
+      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    }
+    // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
+    out_shape->resize(1, TShape());
+    const TShape &dshp = (*in_shape)[conv::kData];
+    if (dshp.ndim() ==  0) return false;
+
+    if (param_.kernel.ndim() == 1) {
+      // 1d conv
+      CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+      Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
+      Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+                               param_.kernel[0]);
+      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+      wshape[0] *= param_.num_group;
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+      }
+
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+          << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+          << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+          << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+          << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+          << "incorrect dilate size: " << param_.dilate;
+      Shape<3> oshape;
+      oshape[0] = dshape[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = dshape[2] ?
+          (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+      // Perform incomplete shape inference. Fill in the missing values in data shape.
+      // 1) We can always fill in the batch_size.
+      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+      oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
+      dshape[0] = oshape[0];
+      if (oshape[2] && param_.stride[0] == 1) {
+        dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
+      }
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+                          ConvertLayout(dshape, kNCW, param_.layout.value()));
+      // Check whether the kernel sizes are valid
+      if (dshape[2] != 0) {
+        CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+      }
+      return true;
+    } else if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      CHECK_EQ(dshp.ndim(), 4U) \
+          << "Input data should be 4D in batch-num_filter-y-x";
+      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
+      Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
+                               dshape[1] / param_.num_group,
+                               param_.kernel[0], param_.kernel[1]);
+      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+      wshape[0] *= param_.num_group;
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+      }
+
+      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+          << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+          << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+          << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+          << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+          << "incorrect dilate size: " << param_.dilate;
+      Shape<4> oshape;
+      oshape[0] = dshape[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = dshape[2] ?
+        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
+      oshape[3] = dshape[3] ?
+        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+      // Perform incomplete shape inference. Fill in the missing values in data shape.
+      // 1) We can always fill in the batch_size.
+      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+      oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
+      dshape[0] = oshape[0];
+      if (oshape[2] && param_.stride[0] == 1) {
+        dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
+      }
+      if (oshape[3] && param_.stride[1] == 1) {
+        dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
+      }
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+                          ConvertLayout(dshape, kNCHW, param_.layout.value()));
+      // Check whether the kernel sizes are valid
+      if (dshape[2] != 0) {
+        CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+      }
+      if (dshape[3] != 0) {
+        CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+      }
+      return true;
+    } else if (param_.kernel.ndim() == 3) {
+      // 3d conv
+      CHECK_EQ(dshp.ndim(), 5U) \
+        << "Input data should be 5D in batch-num_filter-depth-y-x";
+      Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
+      Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+      wshape[0] *= param_.num_group;
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+      }
+
+      // Note: 3D dilation currently not supported.
+      // Calculations below done to preserve symmetry with 1D/2D code.
+      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+      CHECK_EQ(dshape[1] % param_.num_group, 0U)
+        << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U)
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+        << "incorrect dilate size: " << param_.dilate;
+      CHECK_EQ(param_.dilate.Size(), 1U)
+        << "Dilate is not supported in 3d convolution";
+      Shape<5> oshape;
+      oshape[0] = dshape[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = dshape[2] ?
+        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
+      oshape[3] = dshape[3] ?
+        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
+      oshape[4] = dshape[4] ?
+        (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+      // Perform incomplete shape inference. Fill in the missing values in data shape.
+      // 1) We can always fill in the batch_size.
+      // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
+      oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
+      dshape[0] = oshape[0];
+      if (oshape[2] && param_.stride[0] == 1) {
+        dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
+      }
+      if (oshape[3] && param_.stride[1] == 1) {
+        dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
+      }
+      if (oshape[4] && param_.stride[2] == 1) {
+        dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
+      }
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+                          ConvertLayout(dshape, kNCDHW, param_.layout.value()));
+      // Check whether the kernel sizes are valid
+      if (dshape[2] != 0) {
+        CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+      }
+      if (dshape[3] != 0) {
+        CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+      }
+      if (dshape[4] != 0) {
+        CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
+      }
+      return true;
+    } else {
+      LOG(FATAL) << "Unknown convolution type";
+      return false;
+    }
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new ConvolutionProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "Convolution";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]};
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
+ private:
+  // Adds symmetric padding to a data input (in one dimension)
+  index_t AddPad(index_t dsize, index_t pad) const {
+    return dsize + 2 * pad;
+  }
+
+  ConvolutionParam param_;
+};  // class ConvolutionProp
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
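
The shape-inference code above applies the standard convolution size formula, out = (in + 2*pad - dilated_kernel) / stride + 1, per spatial dimension and, as its comments note, back-fills a missing input size from the output only when the stride is 1, where the relation inverts exactly. A small numeric sketch of both directions:

    #include <iostream>

    // Forward spatial size, as computed per dimension in the InferShape code above.
    int ConvOutSize(int in, int pad, int dilated_ksize, int stride) {
      return (in + 2 * pad - dilated_ksize) / stride + 1;
    }

    // Back-calculated input size; exact only when stride == 1.
    int ConvInSizeFromOut(int out, int pad, int dilated_ksize) {
      return out + dilated_ksize - 1 - 2 * pad;
    }

    int main() {
      const int dilated = 1 + (3 - 1) * 2;             // 3-wide kernel, dilation 2 -> effective size 5
      const int out = ConvOutSize(32, 2, dilated, 1);  // (32 + 4 - 5) / 1 + 1 = 32
      std::cout << out << " " << ConvInSizeFromOut(out, 2, dilated) << "\n";  // 32 32
      return 0;
    }
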
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index 951063fb4b..ef8ec9034d 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -21,13 +21,15 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cc
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
 
 #include "./convolution-inl.h"
-#include "../elemwise_op_common.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "../mkl/mkl_memory-inl.h"
+#include "../mkl/mkl_convolution-inl.h"
+#endif  // MXNET_USE_MKL2017
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
@@ -36,351 +38,63 @@ namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(ConvolutionParam);
 
-static inline index_t AddPad(index_t dsize, index_t pad) {
-  return dsize + 2 * pad;
-}
-
-static inline std::vector<std::string> ListArguments(const ConvolutionParam& param_) {
-  if (!param_.no_bias) {
-    return {"data", "weight", "bias"};
-  } else {
-    return {"data", "weight"};
+template<>
+Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+  // If 1D convolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      op = new ConvolutionOp<cpu, DType>(param);
+    })
+    return op;
   }
-}
-
-#if MXNET_USE_MKLDNN == 1
-static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                    const OpContext& ctx,
-                                    const std::vector<NDArray>& inputs,
-                                    const std::vector<OpReqType>& req,
-                                    const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-
-static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                        const OpContext& ctx,
-                                        const std::vector<NDArray>& inputs,
-                                        const std::vector<OpReqType>& req,
-                                        const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-#endif
-
-static bool ConvolutionShape(const nnvm::NodeAttrs& attrs,
-                             std::vector<TShape> *in_shape,
-                             std::vector<TShape> *out_shape) {
-  using namespace mshadow;
-  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
-  if (!param_.no_bias) {
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-  } else {
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-  }
-  // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
-  out_shape->resize(1, TShape());
-  const TShape &dshp = (*in_shape)[conv::kData];
-  if (dshp.ndim() ==  0) return false;
-
-  if (param_.kernel.ndim() == 1) {
-    // 1d conv
-    CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-    Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
-    Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-        param_.kernel[0]);
-    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-    wshape[0] *= param_.num_group;
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-    }
-
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-    Shape<3> oshape;
-    oshape[0] = dshape[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = dshape[2] ?
-      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-    // Perform incomplete shape inference. Fill in the missing values in data shape.
-    // 1) We can always fill in the batch_size.
-    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-    oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
-    dshape[0] = oshape[0];
-    if (oshape[2] && param_.stride[0] == 1) {
-      dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-        ConvertLayout(dshape, kNCW, param_.layout.value()));
-    // Check whether the kernel sizes are valid
-    if (dshape[2] != 0) {
-      CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-    }
-    return true;
-  } else if (param_.kernel.ndim() == 2) {
-    // 2d conv
-    CHECK_EQ(dshp.ndim(), 4U) \
-      << "Input data should be 4D in batch-num_filter-y-x";
-    Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
-    Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
-        dshape[1] / param_.num_group,
-        param_.kernel[0], param_.kernel[1]);
-    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-    wshape[0] *= param_.num_group;
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-    }
-
-    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-    Shape<4> oshape;
-    oshape[0] = dshape[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = dshape[2] ?
-      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
-    oshape[3] = dshape[3] ?
-      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-    // Perform incomplete shape inference. Fill in the missing values in data shape.
-    // 1) We can always fill in the batch_size.
-    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-    oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
-    dshape[0] = oshape[0];
-    if (oshape[2] && param_.stride[0] == 1) {
-      dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
-    }
-    if (oshape[3] && param_.stride[1] == 1) {
-      dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-        ConvertLayout(dshape, kNCHW, param_.layout.value()));
-    // Check whether the kernel sizes are valid
-    if (dshape[2] != 0) {
-      CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-    }
-    if (dshape[3] != 0) {
-      CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-    }
-    return true;
-  } else if (param_.kernel.ndim() == 3) {
-    // 3d conv
-    CHECK_EQ(dshp.ndim(), 5U) \
-      << "Input data should be 5D in batch-num_filter-depth-y-x";
-    Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
-    Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-    wshape[0] *= param_.num_group;
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-    }
-
-    // Note: 3D dilation currently not supported.
-    // Calculations below done to preserve symmetry with 1D/2D code.
-    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-    CHECK_EQ(dshape[1] % param_.num_group, 0U)
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U)
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-    CHECK_EQ(param_.dilate.Size(), 1U)
-      << "Dilate is not supported in 3d convolution";
-    Shape<5> oshape;
-    oshape[0] = dshape[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = dshape[2] ?
-      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
-    oshape[3] = dshape[3] ?
-      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
-    oshape[4] = dshape[4] ?
-      (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-    // Perform incomplete shape inference. Fill in the missing values in data shape.
-    // 1) We can always fill in the batch_size.
-    // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
-    oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
-    dshape[0] = oshape[0];
-    if (oshape[2] && param_.stride[0] == 1) {
-      dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
+#if MXNET_USE_MKL2017 == 1
+  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
+      && param.kernel.ndim() == 2) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new MKLConvolutionOp<cpu, float>(param);
+    case mshadow::kFloat64:
+      return new MKLConvolutionOp<cpu, double>(param);
+    default:
+      break;
     }
-    if (oshape[3] && param_.stride[1] == 1) {
-      dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
-    }
-    if (oshape[4] && param_.stride[2] == 1) {
-      dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-        ConvertLayout(dshape, kNCDHW, param_.layout.value()));
-    // Check whether the kernel sizes are valid
-    if (dshape[2] != 0) {
-      CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-    }
-    if (dshape[3] != 0) {
-      CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-    }
-    if (dshape[4] != 0) {
-      CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
-    }
-    return true;
-  } else {
-    LOG(FATAL) << "Unknown convolution type";
-    return false;
   }
-}
-
-static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
-                            std::vector<int> *in_type, std::vector<int> *out_type) {
-  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
-  CHECK_GE(in_type->size(), 1U);
-  int dtype = (*in_type)[0];
-  CHECK_NE(dtype, -1) << "First input must have specified type";
-  for (index_t i = 0; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype;
-    } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
+#endif
+#if MXNET_USE_NNPACK == 1
+  const size_t batch_size = (*in_shape)[0][0];
+  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
+      && param.kernel.ndim() == 2 && (!param.no_bias)
+      && param.num_group == 1 && (batch_size == 1 ||
+      ((batch_size > 1) && (param.stride[0] == 1) &&
+      (param.stride[1] == 1)))) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new NNPACKConvolutionOp<cpu, float>(param);
+    default:
+      break;
     }
   }
-  out_type->clear();
-  out_type->push_back(dtype);
-  return true;
-}
-
-inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
-                                   const int dev_mask,
-                                   DispatchMode* dispatch_mode,
-                                   std::vector<int> *in_attrs,
-                                   std::vector<int> *out_attrs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), in_expected);
-  CHECK_EQ(out_attrs->size(), 1);
-
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
 #endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new ConvolutionOp<cpu, DType>(param);
+  })
+  return op;
 }
 
-inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
-                                           const int dev_mask,
-                                           DispatchMode* dispatch_mode,
-                                           std::vector<int> *in_attrs,
-                                           std::vector<int> *out_attrs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 3 : 4;
-  uint32_t out_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), in_expected);
-  CHECK_EQ(out_attrs->size(), out_expected);
-
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
-}
-
-static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
-  using namespace mshadow;
-  ConvolutionParam param_;
-  try {
-    param_.Init(attrs->dict);
-  } catch (const dmlc::ParamError& e) {
-    std::ostringstream os;
-    os << e.what();
-    os << ", in operator " << attrs->op->name << "("
-       << "name=\"" << attrs->name << "\"";
-    for (const auto& k : attrs->dict) {
-      os << ", " << k.first << "=\"" << k.second << "\"";
-    }
-    os << ")";
-    throw dmlc::ParamError(os.str());
-  }
-
-  if (param_.kernel.ndim() == 1) {
-    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-  } else if (param_.kernel.ndim() == 2) {
-    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-  } else {
-    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
-    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-  }
-  attrs->parsed = std::move(param_);
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *ConvolutionProp::CreateOperatorEx(Context ctx,
+                                            std::vector<TShape> *in_shape,
+                                            std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
 }
 
-struct ConvolutionGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    const ConvolutionParam& param = nnvm::get<ConvolutionParam>(n->attrs.parsed);
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-    heads.push_back(n->inputs[conv::kData]);
-    heads.push_back(n->inputs[conv::kWeight]);
-    if (!param.no_bias)
-      heads.push_back(n->inputs[conv::kBias]);
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
-
-NNVM_REGISTER_OP(Convolution)
+MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp)
 .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input.
 
 In the 2-D convolution, given input data with shape *(batch_size,
@@ -454,51 +168,10 @@ There are other options to tune the performance.
   the performance.
 
 )code" ADD_FILELINE)
-.set_num_inputs([](const NodeAttrs& attrs) {
-  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-.set_num_outputs(1)
-.set_attr_parser(ConvolutionParamParser)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
-  if (params.no_bias)
-    return std::vector<std::string>{"data", "weight"};
-  else
-    return std::vector<std::string>{"data", "weight", "bias"};
-})
-.set_attr<nnvm::FInferShape>("FInferShape", ConvolutionShape)
-.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
-.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
-.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
 .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(ConvolutionParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_Convolution)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr_parser(ConvolutionParamParser)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index d7f9e564a6..7234daf0d6 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -21,136 +21,36 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cu
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
 
 #include "./convolution-inl.h"
 #include <vector>
-#include "./depthwise_convolution-inl.h"
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn/cudnn_convolution-inl.h"
 #endif  // MXNET_USE_CUDNN
 
+#include "./depthwise_convolution-inl.h"
+
 namespace mxnet {
 namespace op {
 
-#if MXNET_USE_CUDNN == 1
-template<typename DType>
-static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
-    int forward_compute_type, int backward_compute_type,
-    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
-    const Context& ctx) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNConvolutionOp<DType> op;
-#else
-  static MX_THREAD_LOCAL CuDNNConvolutionOp<DType> op;
-#endif
-  op.Init(param, forward_compute_type, backward_compute_type,
-      in_shape, out_shape, ctx);
-  return op;
-}
-#endif
-
-template<>
-void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  int dtype = inputs[conv::kData].type_flag_;
-
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Forward(ctx, inputs, req, outputs);
-    })
-    return;
-  } else if (param.num_filter == param.num_group &&
-      param.layout.value() == mshadow::kNCHW &&
-      param.num_filter == inputs[conv::kData].shape_[1] &&
-      param.kernel.ndim() == 2 &&
-      param.dilate == mshadow::Shape2(1, 1) &&
-      dtype == mshadow::kFloat32) {
-    std::vector<TShape> in_shape(inputs.size());
-    std::vector<TShape> out_shape(1, outputs[0].shape_);
-    for (size_t i = 0; i < in_shape.size(); i++)
-      in_shape[i] = inputs[i].shape_;
-    DepthwiseConvolutionOp<float> op;
-    op.Init(param, in_shape, out_shape);
-    op.Forward(ctx, inputs, req, outputs);
-    return;
-  }
-
-#if MXNET_USE_CUDNN == 1
-  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
-  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
-
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.cudnn_off) {
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Forward(ctx, inputs, req, outputs);
-    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
-      LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Forward(ctx, inputs, req, outputs);
-    } else {
-      std::vector<TShape> in_shape(inputs.size());
-      std::vector<TShape> out_shape(1, outputs[0].shape_);
-      for (size_t i = 0; i < in_shape.size(); i++)
-        in_shape[i] = inputs[i].shape_;
-      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
-      op.Forward(ctx, inputs, req, outputs);
-    }
-  })
-#else
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    ConvolutionOp<gpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  })
-#endif  // MXNET_USE_CUDNN
-}
-
 template<>
-void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  const TBlob &out_grad = inputs[0];
-  const std::vector<TBlob> &in_grad = outputs;
-  int dtype = out_grad.type_flag_;
-
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    })
-    return;
-  } else if (param.num_filter == param.num_group &&
+Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+
+  // depth wise conv
+  if (param.num_filter == param.num_group &&
       param.layout.value() == mshadow::kNCHW &&
-      param.num_filter == in_data[conv::kData].shape_[1] &&
+      param.num_filter == (*in_shape)[conv::kData][1] &&
       param.kernel.ndim() == 2 &&
       param.dilate == mshadow::Shape2(1, 1) &&
       dtype == mshadow::kFloat32) {
-    // The first element stores out grad.
-    std::vector<TShape> in_shape(in_data.size());
-    std::vector<TShape> out_shape(1, out_grad.shape_);
-    for (size_t i = 0; i < in_shape.size(); i++)
-      in_shape[i] = in_data[i].shape_;
-    DepthwiseConvolutionOp<float> op;
-    op.Init(param, in_shape, out_shape);
-    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    return;
+    op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
+    return op;
   }
 
 #if MXNET_USE_CUDNN == 1
@@ -159,41 +59,23 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      op = new ConvolutionOp<gpu, DType>(param);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
       LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
-      ConvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+      op = new ConvolutionOp<gpu, DType>(param);
     } else {
-      // The first element stores out grad.
-      std::vector<TShape> in_shape(in_data.size());
-      std::vector<TShape> out_shape(1, out_grad.shape_);
-      for (size_t i = 0; i < in_shape.size(); i++)
-        in_shape[i] = in_data[i].shape_;
-      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+      op = new CuDNNConvolutionOp<DType>(param, compute_type, compute_type,
+                                         *in_shape, *out_shape, ctx);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    ConvolutionOp<gpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    op = new ConvolutionOp<gpu, DType>(param);
   })
 #endif  // MXNET_USE_CUDNN
+  return op;
 }
 
-NNVM_REGISTER_OP(Convolution)
-.set_attr<FCompute>("FCompute<gpu>", ConvolutionCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Convolution)
-.set_attr<FCompute>("FCompute<gpu>", ConvolutionGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
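
Note on the convolution.cu hunks above: both the removed and the restored versions dispatch to DepthwiseConvolutionOp only for 2-D, NCHW, fp32 convolutions where the number of filters equals both the number of groups and the input channel count, with unit dilation. A self-contained sketch of that test (simplified stand-in types, not the real ConvolutionParam):

    #include <iostream>

    struct ConvParams {
      int num_filter;
      int num_group;
      int kernel_ndim;
      bool nchw_layout;
      bool unit_dilation;
    };

    // True only for the depthwise-friendly case checked in the diff above.
    bool UseDepthwise(const ConvParams& p, int data_channels, bool is_fp32) {
      return p.num_filter == p.num_group &&
             p.nchw_layout &&
             p.num_filter == data_channels &&
             p.kernel_ndim == 2 &&
             p.unit_dilation &&
             is_fp32;
    }

    int main() {
      ConvParams p{64, 64, 2, true, true};
      std::cout << std::boolalpha
                << UseDepthwise(p, 64, true) << '\n'    // true: one filter per channel
                << UseDepthwise(p, 128, true) << '\n';  // false: channel count differs
    }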
 
diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h
index a89e7bfaf0..888528309c 100644
--- a/src/operator/nn/cudnn/cudnn_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_activation-inl.h
@@ -33,19 +33,12 @@
 namespace mxnet {
 namespace op {
 template<typename DType>
-class CuDNNActivationOp {
+class CuDNNActivationOp : public Operator {
  public:
-  CuDNNActivationOp() {
-    dtype_ = mshadow::DataType<DType>::kCudnnFlag;
-    #if CUDNN_MAJOR >= 5
-    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
-    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
-    #endif
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-  }
-
-  void Init(const ActivationParam &param) {
+  explicit CuDNNActivationOp(ActivationParam param) {
     param_ = param;
+    init_cudnn_ = false;
+    dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     switch (param_.act_type) {
       case activation::kReLU:
         mode_ = CUDNN_ACTIVATION_RELU;
@@ -61,54 +54,67 @@ class CuDNNActivationOp {
         break;
     }
     #if CUDNN_MAJOR >= 5
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
     CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_));
     #endif
   }
 
   ~CuDNNActivationOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
-    #if CUDNN_MAJOR >= 5
-    CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
-    #endif
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
+      #if CUDNN_MAJOR >= 5
+      CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
+      #endif
+    }
   }
 
-  void Forward(const OpContext &ctx, const TBlob &in_data,
-      const OpReqType &req, const TBlob &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> out;
-    if (in_data.ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data.shape_[0],
-                               in_data.shape_[1], 1, 1);
-      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_data[activation::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
+                               in_data[activation::kData].shape_[1], 1, 1);
+      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_data.Size();
+      index_t size_left = in_data[activation::kData].Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data.ndim()) {
-          dshape[i] = in_data.shape_[i];
+        if (i < in_data[activation::kData].ndim()) {
+          dshape[i] = in_data[activation::kData].shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
     }
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          data.shape_[0],
-                                          data.shape_[1],
-                                          data.shape_[2],
-                                          data.shape_[3]));
+    if (!init_cudnn_) {
+      init_cudnn_ = true;
+      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            data.shape_[0],
+                                            data.shape_[1],
+                                            data.shape_[2],
+                                            data.shape_[3]));
+    }
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationForward(s->dnn_handle_,
                                       mode_,
@@ -130,11 +136,20 @@ class CuDNNActivationOp {
     #endif
   }
 
-  void Backward(const OpContext &ctx, const TBlob &out_grad,
-      const TBlob &in_data, const TBlob &out_data,
-      const OpReqType &req, const TBlob &in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(in_grad.size(), 1U);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -142,38 +157,31 @@ class CuDNNActivationOp {
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> output_data;
     Tensor<gpu, 4, DType> input_grad;
-    if (in_grad.ndim() == 2) {
-      Shape<4> dshape = Shape4(in_grad.shape_[0],
-                               in_grad.shape_[1], 1, 1);
-      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_grad[activation::kData].ndim() == 2) {
+      Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0],
+                               in_grad[activation::kData].shape_[1], 1, 1);
+      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_grad.Size();
+      index_t size_left = in_grad[activation::kData].Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad.ndim()) {
-          dshape[i] = in_grad.shape_[i];
+        if (i < in_grad[activation::kData].ndim()) {
+          dshape[i] = in_grad[activation::kData].shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          data.shape_[0],
-                                          data.shape_[1],
-                                          data.shape_[2],
-                                          data.shape_[3]));
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_,
                                        mode_,
@@ -204,6 +212,7 @@ class CuDNNActivationOp {
   }
 
  private:
+  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnActivationMode_t mode_;
   cudnnTensorDescriptor_t shape_desc_;
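
Note on the cudnn_activation-inl.h hunks above: the refactored (removed) code created the cuDNN descriptors eagerly in the constructor and re-set them on every Forward call, while the restored code guards creation and destruction with an init_cudnn_ flag so descriptors are built lazily on first use. A minimal sketch of that lazy-initialization guard, using a stand-in resource rather than real cuDNN handles:

    #include <iostream>

    class LazyResource {
     public:
      ~LazyResource() {
        // Destroy only what was actually created, mirroring the init_cudnn_ check.
        if (initialized_) std::cout << "destroy descriptor\n";
      }
      void Use(int shape) {
        if (!initialized_) {  // create on first use, once the shape is known
          std::cout << "create descriptor for shape " << shape << '\n';
          initialized_ = true;
        }
        std::cout << "use descriptor\n";
      }
     private:
      bool initialized_ = false;
    };

    int main() {
      LazyResource r;
      r.Use(4);  // creates, then uses
      r.Use(4);  // reuses the cached descriptor
    }
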
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
index e233704906..3dc9c8353a 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
+++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
@@ -43,30 +43,28 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar};
 
 #if defined(__CUDACC__)
 template<typename DType>
-class CuDNNBatchNormOp {
+class CuDNNBatchNormOp : public Operator {
  public:
-  CuDNNBatchNormOp() {
+  explicit CuDNNBatchNormOp(BatchNormParam param) {
     using namespace mshadow;
+    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
+     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
+    this->param_ = param;
+    init_cudnn_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // For float16 input type beta, gamma, mean, and average are stored in float32.
     // For other input types, these parameters have the same type as input
     dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? kFloat32 : DataType<DType>::kFlag;
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
-  }
-
-  void Init(const BatchNormParam &param) {
-    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
-     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
-    this->param_ = param;
   }
 
   ~CuDNNBatchNormOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
+    }
   }
 
-  void Forward(const OpContext &ctx,
+  virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -86,7 +84,29 @@ class CuDNNBatchNormOp {
     CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2);
     CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4);
 
-    Init(in_data[cudnnbatchnorm::kData]);
+    if (!init_cudnn_) {
+      for (int i = 0; i < 4; ++i) {
+        if (i < in_data[cudnnbatchnorm::kData].ndim()) {
+          shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i];
+        } else {
+          shape_[i] = 1;
+        }
+      }
+      CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
+      CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            shape_[0],
+                                            shape_[1],
+                                            shape_[2],
+                                            shape_[3]));
+      CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
+                                               io_desc_,
+                                               CUDNN_BATCHNORM_SPATIAL));
+      init_cudnn_  = true;
+    }
+
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -157,7 +177,7 @@ class CuDNNBatchNormOp {
     })
   }
 
-  void Backward(const OpContext &ctx,
+  virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
@@ -173,7 +193,6 @@ class CuDNNBatchNormOp {
     CHECK(ctx.is_train && !param_.use_global_stats)
         << "use global statistics is not yet supported in CuDNNBatchNorm";
 
-    Init(in_data[cudnnbatchnorm::kData]);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -271,27 +290,7 @@ class CuDNNBatchNormOp {
   }
 
  private:
-  void Init(const TBlob &in_data) {
-    for (int i = 0; i < 4; ++i) {
-      if (i < in_data.ndim()) {
-        shape_[i] = in_data.shape_[i];
-      } else {
-        shape_[i] = 1;
-      }
-    }
-
-    CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          shape_[0],
-                                          shape_[1],
-                                          shape_[2],
-                                          shape_[3]));
-    CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
-                                             io_desc_,
-                                             CUDNN_BATCHNORM_SPATIAL));
-  }
-
+  bool init_cudnn_;
   cudnnDataType_t dtype_;
   int dtype_param_;
   cudnnTensorDescriptor_t io_desc_, mean_desc_;
@@ -300,6 +299,91 @@ class CuDNNBatchNormOp {
 };
 #endif  // defined(__CUDACC__)
 
+template<typename xpu>
+Operator *CreateOp_CuDNNv4(BatchNormParam param);
+
+
+#if DMLC_USE_CXX11
+class CuDNNBatchNormProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    in_shape->at(1) = TShape(Shape1(dshape[1]));
+    in_shape->at(2) = TShape(Shape1(dshape[1]));
+
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    out_shape->push_back(Shape1(dshape[1]));
+    out_shape->push_back(Shape1(dshape[1]));
+
+    aux_shape->clear();
+    aux_shape->push_back(Shape1(dshape[1]));
+    aux_shape->push_back(Shape1(dshape[1]));
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new CuDNNBatchNormProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "CuDNNBatchNorm";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[cudnnbatchnorm::kOut],
+            out_data[cudnnbatchnorm::kMean],
+            out_data[cudnnbatchnorm::kInvVar],
+            in_data[cudnnbatchnorm::kData],
+            in_data[cudnnbatchnorm::kGamma]
+           };
+  }
+
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 3;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "gamma", "beta"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "mean", "inv_var"};
+  }
+
+  std::vector<std::string> ListAuxiliaryStates() const override {
+    return {"moving_mean", "moving_inv_var"};
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  BatchNormParam param_;
+};  // class CuDNNBatchNormProp
+
+#endif  // DMLC_USE_CXX11
 #endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
 }  // namespace op
 }  // namespace mxnet
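
Note on the restored CuDNNBatchNormProp::InferShape above: gamma, beta and the auxiliary moving statistics all take the channel dimension of the data, and the op produces the output plus per-channel mean and inverse variance. A small sketch of that shape bookkeeping, with std::vector standing in for TShape:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    using Shape = std::vector<size_t>;

    // Returns {output, mean, inv_var} shapes for a data shape (N, C, ...).
    std::vector<Shape> BatchNormOutputShapes(const Shape& data) {
      assert(data.size() >= 2);       // need at least (N, C)
      Shape channel{data[1]};         // per-channel statistics
      return {data, channel, channel};
    }

    int main() {
      auto shapes = BatchNormOutputShapes({32, 64, 28, 28});
      assert(shapes[0].size() == 4);                          // output matches the data shape
      assert(shapes[1].size() == 1 && shapes[1][0] == 64);    // mean over channels
      assert(shapes[2].size() == 1 && shapes[2][0] == 64);    // inv_var over channels
    }
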
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc
index f1d229dd54..e1e0c999b1 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cc
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc
@@ -21,100 +21,46 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cc
  * \brief
- * \author Junyuan Xie, Da Zheng
+ * \author Junyuan Xie
 */
 
 #include "./cudnn_batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
-#include "../../elemwise_op_common.h"
 
 namespace mxnet {
 namespace op {
-#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
-
-static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector<TShape> *in_shape,
-    std::vector<TShape> *out_shape) {
-  using namespace mshadow;
-  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]";
-  const TShape &dshape = in_shape->at(0);
-  if (dshape.ndim() == 0) return false;
-  in_shape->at(1) = TShape(Shape1(dshape[1]));
-  in_shape->at(2) = TShape(Shape1(dshape[1]));
-  in_shape->at(3) = TShape(Shape1(dshape[1]));
-  in_shape->at(4) = TShape(Shape1(dshape[1]));
-
-  out_shape->clear();
-  out_shape->push_back(dshape);
-  out_shape->push_back(Shape1(dshape[1]));
-  out_shape->push_back(Shape1(dshape[1]));
-
-  return true;
-}
-
-static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+#if CUDNN_MAJOR >= 4
+template<>
+Operator *CreateOp_CuDNNv4<cpu>(BatchNormParam param) {
   LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
+  return NULL;
 }
 
-static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-  LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
+Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const {
+#if CUDNN_MAJOR >= 5
+  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
+                " Use the latter instead.";
+  return nullptr;
+#else
+  DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_);
+#endif
 }
 
-NNVM_REGISTER_OP(CuDNNBatchNorm)
+MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp)
 .describe("Apply batch normalization to input.")
-.set_num_inputs(5)
-.set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"output", "mean", "var"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
-    [](const NodeAttrs& attrs) {
-  return 1;
-})
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
-  return std::vector<uint32_t>{3, 4};
-})
-.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute_CPU)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"})
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
-.add_argument("beta", "NDArray-or-Symbol", "beta array")
-.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
-.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__())
-.set_attr<nnvm::FSetInputVarAttrOnCompose>(
-  "FSetInputVarAttrOnCompose",
-  [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
-    if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
-    if (index == 3) {
-      var->attrs.dict["__init__"] = "[\"zero\", {}]";
-    } else if (index == 4) {
-      var->attrs.dict["__init__"] = "[\"one\", {}]";
-    }
-  });
-
-NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
-.set_num_outputs(5)
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
-  return std::vector<uint32_t>{6, 7};
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute_CPU);
+.add_arguments(BatchNormParam::__FIELDS__());
 
+NNVM_REGISTER_OP(CuDNNBatchNorm)
+.set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
+    [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
+      if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
+      if (index == 3) {
+        var->attrs.dict["__init__"] = "[\"zero\", {}]";
+      } else if (index == 4) {
+        var->attrs.dict["__init__"] = "[\"one\", {}]";
+      }
+    });
 #endif  // CUDNN_MAJOR >= 4
-
 }  // namespace op
 }  // namespace mxnet
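
Note on the FSetInputVarAttrOnCompose hook restored above: when the symbol is composed, the moving-statistics input variables receive a default __init__ attribute unless the user has already set one. A sketch of that "default only if absent" behaviour, with a plain map standing in for the variable's attribute dictionary (names are illustrative):

    #include <iostream>
    #include <map>
    #include <string>

    void SetDefaultInit(std::map<std::string, std::string>* attrs, const std::string& init) {
      if (attrs->count("__init__")) return;  // keep an explicit user setting
      (*attrs)["__init__"] = init;
    }

    int main() {
      std::map<std::string, std::string> moving_mean;  // no user-provided init
      std::map<std::string, std::string> moving_var{{"__init__", "[\"one\", {}]"}};
      SetDefaultInit(&moving_mean, "[\"zero\", {}]");
      SetDefaultInit(&moving_var, "[\"zero\", {}]");   // untouched: user already set it
      std::cout << moving_mean["__init__"] << '\n' << moving_var["__init__"] << '\n';
    }
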
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu
index e07cd1e6c8..e96db2e5e7 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cu
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cu
  * \brief
- * \author Junyuan Xie, Da Zheng
+ * \author Junyuan Xie
 */
 
 #include "./cudnn_batch_norm-inl.h"
@@ -30,60 +30,10 @@
 namespace mxnet {
 namespace op {
 #if CUDNN_MAJOR == 4
-
-template<typename DType>
-static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNBatchNormOp<DType> op;
-#else
-  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
-#endif
-  op.Init(param);
-  return op;
-}
-
-static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-#if CUDNN_MAJOR >= 5
-  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
-    "Use the later instead.";
-#else
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 5U);
-  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
-  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
-  GetCuDNNOp<float>(param).Forward(ctx, in_data, req, outputs, aux_states);
-#endif
-}
-
-static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
-#if CUDNN_MAJOR >= 5
-  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
-    "Use the later instead.";
-#else
-  CHECK_EQ(inputs.size(), 11U);
-  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  std::vector<TBlob> out_grad(1, inputs[0]);
-  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
-  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
-  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
-  GetCuDNNOp<float>(param).Backward(ctx, out_grad, in_data, out_data,
-      req, in_grad, aux_states);
-#endif
+template<>
+Operator *CreateOp_CuDNNv4<gpu>(BatchNormParam param) {
+  return new CuDNNBatchNormOp<float>(param);
 }
-
-NNVM_REGISTER_OP(CuDNNBatchNorm)
-.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute_CuDNNv4);
-
-NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
-.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute_CuDNNv4);
-
 #endif  // CUDNN_MAJOR == 4
 }  // namespace op
 }  // namespace mxnet
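
Note on the cudnn_batch_norm.cu hunks above: the removed GetCuDNNOp helper cached one operator instance per thread (thread_local) and re-ran Init on each call, whereas the restored CreateOp_CuDNNv4 allocates a fresh operator. A minimal sketch of the removed caching pattern with placeholder types (not the real CuDNNBatchNormOp API):

    #include <iostream>

    struct Params { float eps; };

    struct Op {
      Params p{};
      void Init(const Params& params) { p = params; }  // refreshed on every lookup
      void Forward() const { std::cout << "forward eps=" << p.eps << '\n'; }
    };

    // One cached instance per thread avoids re-allocating the operator on each call.
    Op& GetCachedOp(const Params& params) {
      static thread_local Op op;
      op.Init(params);
      return op;
    }

    int main() {
      GetCachedOp({1e-3f}).Forward();
      GetCachedOp({1e-5f}).Forward();  // same object, new parameters
    }
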
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index 229ba3cb1a..8ffe97d943 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -42,19 +42,9 @@ namespace op {
  * \brief The Operator used to perform convolution using cuDNN kernels.
  */
 template<typename DType>
-class CuDNNConvolutionOp {
+class CuDNNConvolutionOp : public Operator {
  public:
-  CuDNNConvolutionOp() {
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
-  }
-
-  void Init(const ConvolutionParam& param,
+  explicit CuDNNConvolutionOp(const ConvolutionParam& param,
                               int forward_compute_type,
                               int backward_compute_type,
                               const std::vector<TShape>& in_shape,
@@ -67,6 +57,8 @@ class CuDNNConvolutionOp {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    init_cudnn_ = false;
+    init_temp_size_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -110,19 +102,22 @@ class CuDNNConvolutionOp {
   }
 
   ~CuDNNConvolutionOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
+    }
   }
 
-  void Forward(const OpContext &ctx,
+  virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data) {
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_data.size(), expected);
@@ -179,17 +174,18 @@ class CuDNNConvolutionOp {
     }
   }
 
-  void Backward(const OpContext &ctx,
+  virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad) {
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(in_grad.size(), expected);
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
 
     // I/O's should have 2 more dims than the kernel dim
@@ -199,7 +195,6 @@ class CuDNNConvolutionOp {
     DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s);
     DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s);
 
-    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -338,6 +333,13 @@ class CuDNNConvolutionOp {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[conv::kData];
     TShape wshape = in_shape[conv::kWeight];
@@ -510,6 +512,7 @@ class CuDNNConvolutionOp {
                                           &bias_shape[0],
                                           &bias_stride[0]));
     }
+    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -753,6 +756,7 @@ class CuDNNConvolutionOp {
   }
 
   void GetTempSize(const OpContext& ctx) {
+    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_size = 0, back_size_w = 0;
     CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
@@ -777,6 +781,8 @@ class CuDNNConvolutionOp {
                out_desc_,
                forward_algo_.AlgoNumber(),
                &forward_workspace_byte_));
+
+    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -841,6 +847,8 @@ class CuDNNConvolutionOp {
   std::vector<int> param_dilate_;
   std::vector<int> param_pad_;
 
+  bool init_cudnn_;
+  bool init_temp_size_;
   // Temp workspace size in bytes needed for Forward() operation.
   size_t forward_workspace_byte_;
   // Temp workspace size in bytes needed for Backward() operation.
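
Note on the cudnn_convolution-inl.h hunks above: the restored GetTempSize bails out early once init_temp_size_ is set, so the cuDNN workspace-size queries run only on the first call. A compute-once sketch of that guard (WorkspaceSizer is a hypothetical stand-in, not the real class):

    #include <cstddef>
    #include <iostream>

    class WorkspaceSizer {
     public:
      size_t Get() {
        if (!computed_) {   // the expensive query happens only once
          bytes_ = Query();
          computed_ = true;
        }
        return bytes_;
      }
     private:
      static size_t Query() {
        std::cout << "querying workspace size\n";
        return 1 << 20;     // placeholder value
      }
      bool computed_ = false;
      size_t bytes_ = 0;
    };

    int main() {
      WorkspaceSizer w;
      std::cout << w.Get() << '\n';  // prints the query message, then the size
      std::cout << w.Get() << '\n';  // cached: size only
    }
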
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index 3c80cdcba4..bc02d1b73f 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -39,19 +39,9 @@ namespace op {
 #if MXNET_USE_CUDNN == 1
 
 template<typename DType>
-class CuDNNDeconvolutionOp {
+class CuDNNDeconvolutionOp : public Operator {
  public:
-  CuDNNDeconvolutionOp() {
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
-  }
-
-  void Init(DeconvolutionParam param,
+  explicit CuDNNDeconvolutionOp(DeconvolutionParam param,
                                 int forward_compute_type,
                                 int backward_compute_type,
                                 const std::vector<TShape>& in_shape,
@@ -64,6 +54,8 @@ class CuDNNDeconvolutionOp {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    init_cudnn_ = false;
+    init_temp_size_ = false;
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -107,19 +99,22 @@ class CuDNNDeconvolutionOp {
   }
 
   ~CuDNNDeconvolutionOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
+    }
   }
 
-  void Forward(const OpContext &ctx,
+  virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data) {
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_data.size(), expected);
@@ -192,17 +187,18 @@ class CuDNNDeconvolutionOp {
     }
   }
 
-  void Backward(const OpContext &ctx,
+  virtual void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad) {
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U);
-    CHECK_EQ(in_grad.size(), expected);
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
 
     // I/O's should have 2 more dims than the kernel dim
@@ -217,7 +213,6 @@ class CuDNNDeconvolutionOp {
       CHECK_NE(req[deconv::kBias], kWriteInplace);
     }
     CHECK_NE(req[deconv::kData], kWriteInplace);
-    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -353,6 +348,13 @@ class CuDNNDeconvolutionOp {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[deconv::kData];
     TShape wshape = in_shape[deconv::kWeight];
@@ -534,6 +536,7 @@ class CuDNNDeconvolutionOp {
                                             &bias_shape[0],
                                             &bias_stride[0]));
     }
+    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -786,6 +789,7 @@ class CuDNNDeconvolutionOp {
   }
 
   void GetTempSize(const OpContext& ctx) {
+    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_data_algo_workspace_size = 0;
     size_t back_filter_algo_workspace_size = 0;
@@ -815,6 +819,7 @@ class CuDNNDeconvolutionOp {
     forward_workspace_byte_ = back_data_algo_workspace_size;
     backward_workspace_byte_ = std::max(forward_algo_workspace_size,
                                         back_filter_algo_workspace_size);
+    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -877,11 +882,8 @@ class CuDNNDeconvolutionOp {
   std::vector<int> param_stride_;
   std::vector<int> param_dilate_;
 
-  int forward_compute_type_;
-  int backward_compute_type_;
-  const std::vector<TShape> in_shapes_;
-  const std::vector<TShape> out_shapes_;
-
+  bool init_cudnn_;
+  bool init_temp_size_;
   // Temp workspace size in bytes needed for Forward() operation.  Note that
   // in deconvolution, this is handled by the cuDNN backprop-to-data kernel.
   size_t forward_workspace_byte_;
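
Note on the cudnn_deconvolution-inl.h hunks above: deconvolution's forward pass runs cuDNN's backprop-to-data kernel, so its workspace is the backward-data size, while the backward pass takes the larger of the forward-algorithm and backward-filter sizes. A small sketch of that bookkeeping (names are illustrative):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    struct DeconvWorkspace {
      size_t forward_bytes;   // sized for the backprop-to-data kernel
      size_t backward_bytes;  // max of the remaining two requirements
    };

    DeconvWorkspace PickWorkspace(size_t back_data, size_t back_filter, size_t forward_algo) {
      return {back_data, std::max(forward_algo, back_filter)};
    }

    int main() {
      DeconvWorkspace ws = PickWorkspace(1u << 20, 1u << 22, 1u << 21);
      std::cout << ws.forward_bytes << ' ' << ws.backward_bytes << '\n';
    }
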
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 8442b37058..104ed8546d 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -34,18 +34,13 @@ namespace mxnet {
 namespace op {
 
 template<typename DType>
-class CuDNNPoolingOp {
+class CuDNNPoolingOp : public Operator {
  public:
-  CuDNNPoolingOp() {
+  explicit CuDNNPoolingOp(PoolingParam p) {
+    param_ = p;
+    init_cudnn_ = false;
     // TODO(xxx): fp16
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
-    CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-  }
-
-  void Init(const PoolingParam &p) {
-    param_ = p;
     switch (param_.pool_type) {
       case pool_enum::kMaxPooling:
         mode_ = CUDNN_POOLING_MAX;
@@ -59,24 +54,33 @@ class CuDNNPoolingOp {
   }
 
   ~CuDNNPoolingOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-    CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+      CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
+    }
   }
 
-  void Forward(const OpContext &ctx, const TBlob &in_data,
-      const OpReqType &req, const TBlob &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
-    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
+      if (!init_cudnn_) {
+        this->Init(s, in_data, out_data);
+      }
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -89,8 +93,11 @@ class CuDNNPoolingOp {
                                      out.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
+      if (!init_cudnn_) {
+        this->Init(s, in_data, out_data);
+      }
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -106,23 +113,31 @@ class CuDNNPoolingOp {
     }
   }
 
-  void Backward(const OpContext &ctx, const TBlob &out_grad,
-      const TBlob &in_data, const TBlob &out_data,
-      const OpReqType &req, const TBlob &in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(in_grad.size(), 1U);
 
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
-    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> m_out_grad = out_grad.get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_data = in_data.get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_out_data = out_data.get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_grad = in_grad.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 4, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -137,10 +152,10 @@ class CuDNNPoolingOp {
                                       m_in_grad.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> m_out_grad = out_grad.get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_data = in_data.get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_out_data = out_data.get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_grad = in_grad.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 5, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -159,115 +174,129 @@ class CuDNNPoolingOp {
   }
 
  private:
-  inline void Init(mshadow::Stream<gpu> *s, const TBlob &in_data,
-      const TBlob &out_data) {
+  inline void Init(mshadow::Stream<gpu> *s,
+                   const std::vector<TBlob> &in_data,
+                   const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     #if CUDNN_MAJOR >= 5
     nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
     #endif
-    if (param_.kernel.ndim() == 2) {
-      // 2d conv
-      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
-      mshadow::Shape<4> dshape = data.shape_;
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            data.shape_[0],
-                                            data.shape_[1],
-                                            data.shape_[2],
-                                            data.shape_[3]));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            out.shape_[0],
-                                            out.shape_[1],
-                                            out.shape_[2],
-                                            out.shape_[3]));
-      #if CUDNN_MAJOR >= 5
-      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                             mode_,
-                                             nan_prop_,
-                                             param_.global_pool ? dshape[2] : param_.kernel[0],
-                                             param_.global_pool ? dshape[3] : param_.kernel[1],
-                                             param_.pad[0],
-                                             param_.pad[1],
-                                             param_.global_pool ? 1 : param_.stride[0],
-                                             param_.global_pool ? 1 :param_.stride[1]));
-      #else
-      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                             mode_,
-                                             param_.global_pool ? dshape[2] : param_.kernel[0],
-                                             param_.global_pool ? dshape[3] : param_.kernel[1],
-                                             param_.pad[0],
-                                             param_.pad[1],
-                                             param_.global_pool ? 1 : param_.stride[0],
-                                             param_.global_pool ? 1 : param_.stride[1]));
-      #endif
-    } else {
-      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
-      std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
-                                 static_cast<int>(data.shape_[1]),
-                                 static_cast<int>(data.shape_[2]),
-                                 static_cast<int>(data.shape_[3]),
-                                 static_cast<int>(data.shape_[4])};
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    if (!init_cudnn_) {
+      init_cudnn_ = true;
+      if (param_.kernel.ndim() == 2) {
+        // 2d conv
+        Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
+        Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
+        mshadow::Shape<4> dshape = data.shape_;
+        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
+        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+        CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
+                                              CUDNN_TENSOR_NCHW,
+                                              dtype_,
+                                              data.shape_[0],
+                                              data.shape_[1],
+                                              data.shape_[2],
+                                              data.shape_[3]));
+        CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
+                                              CUDNN_TENSOR_NCHW,
+                                              dtype_,
+                                              out.shape_[0],
+                                              out.shape_[1],
+                                              out.shape_[2],
+                                              out.shape_[3]));
+        #if CUDNN_MAJOR >= 5
+        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                               mode_,
+                                               nan_prop_,
+                                               param_.global_pool ? dshape[2] : param_.kernel[0],
+                                               param_.global_pool ? dshape[3] : param_.kernel[1],
+                                               param_.pad[0],
+                                               param_.pad[1],
+                                               param_.global_pool ? 1 : param_.stride[0],
+                                               param_.global_pool ? 1 : param_.stride[1]));
+        #else
+        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                               mode_,
+                                               param_.global_pool ? dshape[2] : param_.kernel[0],
+                                               param_.global_pool ? dshape[3] : param_.kernel[1],
+                                               param_.pad[0],
+                                               param_.pad[1],
+                                               param_.global_pool ? 1 : param_.stride[0],
+                                               param_.global_pool ? 1 : param_.stride[1]));
+        #endif
+      } else {
+        Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
+        Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
+        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
+        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+        std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
+                                   static_cast<int>(data.shape_[1]),
+                                   static_cast<int>(data.shape_[2]),
+                                   static_cast<int>(data.shape_[3]),
+                                   static_cast<int>(data.shape_[4])};
 
-      std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
-                                  static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
-                                  static_cast<int>(ishape[3] * ishape[4]),
-                                  static_cast<int>(ishape[4]), 1};
+        std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[3] * ishape[4]),
+                                    static_cast<int>(ishape[4]),
+                                    1};
 
-      std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
-                                 static_cast<int>(out.shape_[1]),
-                                 static_cast<int>(out.shape_[2]),
-                                 static_cast<int>(out.shape_[3]),
-                                 static_cast<int>(out.shape_[4])};
+        std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
+                                   static_cast<int>(out.shape_[1]),
+                                   static_cast<int>(out.shape_[2]),
+                                   static_cast<int>(out.shape_[3]),
+                                   static_cast<int>(out.shape_[4])};
 
-      std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
-                                  static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
-                                  static_cast<int>(oshape[3] * oshape[4]),
-                                  static_cast<int>(oshape[4]), 1};
+        std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[3] * oshape[4]),
+                                    static_cast<int>(oshape[4]),
+                                    1};
 
-      std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
-                                                          static_cast<int>(param_.kernel[0]),
-                                     param_.global_pool ? ishape[3] :
-                                                          static_cast<int>(param_.kernel[1]),
-                                     param_.global_pool ? ishape[4] :
-                                                          static_cast<int>(param_.kernel[2])};
+        std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
+                                                            static_cast<int>(param_.kernel[0]),
+                                       param_.global_pool ? ishape[3] :
+                                                            static_cast<int>(param_.kernel[1]),
+                                       param_.global_pool ? ishape[4] :
+                                                            static_cast<int>(param_.kernel[2])};
 
-      std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
-                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
-                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
+        std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
+                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
+                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
 
-      std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
-                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
-                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
+        std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
+                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
+                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
 
-      CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
-                                            dtype_,
-                                            static_cast<int>(ishape.size()),
-                                            &ishape[0],
-                                            &istride[0]));
-      CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
-                                            dtype_,
-                                            static_cast<int>(oshape.size()),
-                                            &oshape[0],
-                                            &ostride[0]));
-      #if CUDNN_MAJOR >= 5
-      CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
-                                             mode_,
-                                             nan_prop_,
-                                             static_cast<int>(kernel_vec.size()),
-                                             &(kernel_vec[0]),
-                                             &(pad_vec[0]),
-                                             &(stride_vec[0])));
-      #else
-      LOG(FATAL) << "3D pooling only supports CUDNN v5 and above";
-      #endif
+        CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
+                                              dtype_,
+                                              static_cast<int>(ishape.size()),
+                                              &ishape[0],
+                                              &istride[0]));
+        CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
+                                              dtype_,
+                                              static_cast<int>(oshape.size()),
+                                              &oshape[0],
+                                              &ostride[0]));
+        #if CUDNN_MAJOR >= 5
+        CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
+                                               mode_,
+                                               nan_prop_,
+                                               static_cast<int>(kernel_vec.size()),
+                                               &(kernel_vec[0]),
+                                               &(pad_vec[0]),
+                                               &(stride_vec[0])));
+        #else
+        LOG(FATAL) << "3D pooling only supports CUDNN v5 and above";
+        #endif
+      }
     }
   }
-
+  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnHandle_t handle_;
   cudnnPoolingMode_t mode_;
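
For reference, the descriptor setup restored above expresses global pooling by passing the full input extent as the pooling window with zero padding and unit stride. A minimal standalone sketch of the resulting output-size arithmetic; the helper name PooledSize is hypothetical and not part of the patch:

    #include <cstdio>

    // Hypothetical helper: spatial output size of a pooling window,
    // out = (in + 2*pad - kernel) / stride + 1 (floor division).
    static int PooledSize(int in, int kernel, int pad, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      std::printf("%d\n", PooledSize(224, 3, 0, 2));    // regular 3x3, stride 2 -> 111
      std::printf("%d\n", PooledSize(224, 224, 0, 1));  // global pooling setup  -> 1
      return 0;
    }
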
diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
index 239da02366..5afdb48443 100644
--- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
@@ -32,64 +32,73 @@
 
 namespace mxnet {
 namespace op {
-class CuDNNSoftmaxActivationOp {
+class CuDNNSoftmaxActivationOp : public Operator {
  public:
-  CuDNNSoftmaxActivationOp() {
-    dtype_ = CUDNN_DATA_FLOAT;
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-  }
-
-  void Init(SoftmaxActivationParam param) {
+  explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) {
     this->param_ = param;
+    init_cudnn_ = false;
+    dtype_ = CUDNN_DATA_FLOAT;
   }
 
   ~CuDNNSoftmaxActivationOp() {
-    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
+    if (init_cudnn_) {
+      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
+    }
   }
 
-  void Forward(const OpContext &ctx, const TBlob &in_data,
-      const OpReqType &req, const TBlob &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4> data;
     Tensor<gpu, 4> out;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_data.ndim(), 2)
+      CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1);
-      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0],
+                               in_data[softmax_activation::kData].shape_[1], 1, 1);
+      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_data.ndim(), 3)
+      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_data.Size();
+      index_t size_left = in_data[softmax_activation::kData].Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data.ndim()) {
-          dshape[i] = in_data.shape_[i];
+        if (i < in_data[softmax_activation::kData].ndim()) {
+          dshape[i] = in_data[softmax_activation::kData].shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     float alpha = 1.0f;
     float beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                          CUDNN_TENSOR_NCHW,
-                                          dtype_,
-                                          data.shape_[0],
-                                          data.shape_[1],
-                                          data.shape_[2],
-                                          data.shape_[3]));
+    if (!init_cudnn_) {
+      init_cudnn_ = true;
+      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            data.shape_[0],
+                                            data.shape_[1],
+                                            data.shape_[2],
+                                            data.shape_[3]));
+    }
     CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_,
                                    CUDNN_SOFTMAX_ACCURATE,
                                    softmax_mode,
@@ -101,10 +110,19 @@ class CuDNNSoftmaxActivationOp {
                                    out.dptr_));
   }
 
-  void Backward(const OpContext &ctx, const TBlob &out_grad,
-      const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(in_grad.size(), 1U);
     float alpha = 1.0f;
     float beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -114,30 +132,31 @@ class CuDNNSoftmaxActivationOp {
     Tensor<gpu, 4> input_grad;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_grad.ndim(), 2)
+      CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1);
-      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
-      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0],
+                               in_grad[softmax_activation::kData].shape_[1], 1, 1);
+      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_grad.ndim(), 3)
+      CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_grad.Size();
+      index_t size_left = in_grad[softmax_activation::kData].Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad.ndim()) {
-          dshape[i] = in_grad.shape_[i];
+        if (i < in_grad[softmax_activation::kData].ndim()) {
+          dshape[i] = in_grad[softmax_activation::kData].shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
-      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
@@ -155,6 +174,7 @@ class CuDNNSoftmaxActivationOp {
   }
 
  private:
+  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnTensorDescriptor_t shape_desc_;
   SoftmaxActivationParam param_;
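
The class restored above goes back to lazy descriptor creation: init_cudnn_ starts false, the tensor descriptor is created and shaped on the first Forward call, and the destructor only destroys it if it was ever created. Below is a rough, dependency-free sketch of that guard pattern; all names are illustrative, not cuDNN or MXNet API:

    #include <cstdio>

    // Hypothetical stand-in for a cuDNN descriptor; names are illustrative only.
    struct FakeDesc { int h = 0, w = 0; };

    class LazyDescOp {
     public:
      ~LazyDescOp() {
        if (init_) std::printf("destroying descriptor\n");  // guarded destroy, as in the patch
      }
      void Forward(int h, int w) {
        if (!init_) {  // descriptor is created and shaped on the first call only
          init_ = true;
          desc_.h = h;
          desc_.w = w;
          std::printf("created %dx%d descriptor\n", desc_.h, desc_.w);
        }
        // ... launch the kernel against desc_ ...
      }
     private:
      bool init_ = false;
      FakeDesc desc_;
    };

    int main() {
      LazyDescOp op;
      op.Forward(32, 32);  // creates the descriptor
      op.Forward(32, 32);  // reuses it
      return 0;
    }
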
diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index b6d522b9e6..fbdfaa84fa 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution-inl.h
  * \brief
- * \author Wei Wu, Da Zheng
+ * \author Wei Wu
 */
 #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
@@ -195,18 +195,19 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class DeconvolutionOp {
+class DeconvolutionOp : public Operator {
  public:
-  void Init(DeconvolutionParam p) {
+  explicit DeconvolutionOp(DeconvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(real_t);
   }
 
-  void Forward(const OpContext &ctx,
-               const std::vector<TBlob> &in_data,
-               const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
 
@@ -321,18 +322,19 @@ class DeconvolutionOp {
     }
   }
 
-  void Backward(const OpContext &ctx,
-                const std::vector<TBlob> &out_grad,
-                const std::vector<TBlob> &in_data,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
     using namespace mshadow;
     using namespace mshadow::expr;
     // TODO(bing): check the BLAS Handle, be careful
     CHECK_EQ(out_grad.size(), 1U);
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(in_grad.size(), expected);
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
     // get data
@@ -487,52 +489,300 @@ class DeconvolutionOp {
 };  // class DeconvolutionOp
 
 template<typename xpu>
-void _DeconvolutionCompute(const DeconvolutionParam& param,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
-    DeconvolutionOp<xpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  });
-}
+Operator* CreateOp(DeconvolutionParam param, int dtype,
+                   std::vector<TShape> *in_shape,
+                   std::vector<TShape> *out_shape,
+                   Context ctx);
 
-template<typename xpu>
-void DeconvolutionCompute(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx, const std::vector<TBlob>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<TBlob>& outputs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  _DeconvolutionCompute<xpu>(param, ctx, inputs, req, outputs);
-}
+#if DMLC_USE_CXX11
+class DeconvolutionProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    if (!param_.no_bias) {
+      return {"data", "weight", "bias"};
+    } else {
+      return {"data", "weight"};
+    }
+  }
 
-template<typename xpu>
-void _DeconvolutionGradCompute(const DeconvolutionParam& param,
-                               const OpContext& ctx, const std::vector<TBlob>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<TBlob>& outputs) {
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  const TBlob &out_grad = inputs[0];
-  const std::vector<TBlob> &in_grad = outputs;
-
-  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
-    DeconvolutionOp<xpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-  });
-}
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 1) {
+      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+      if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
+    } else if (param_.kernel.ndim() == 2) {
+      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+      if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
+    } else {
+      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
+      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+      if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
+    }
+  }
 
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
 
-template<typename xpu>
-void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs,
-                              const OpContext& ctx, const std::vector<TBlob>& inputs,
-                              const std::vector<OpReqType>& req,
-                              const std::vector<TBlob>& outputs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  _DeconvolutionGradCompute<xpu>(param, ctx, inputs, req, outputs);
-}
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+#if MXNET_USE_CUDNN == 0
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
+      return false;
+    }
+#endif  // CUDNN
+
+    using namespace mshadow;
+    if (!param_.no_bias) {
+      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+    } else {
+      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    }
+    out_shape->resize(1, TShape());
+    const TShape &dshape = (*in_shape)[deconv::kData];
+    if (dshape.ndim() ==  0) return false;
+
+    if (param_.kernel.ndim() == 1) {
+      // 1d conv
+      CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+      Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
+      Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
+                               param_.kernel[0]);
+      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+      }
+
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+
+      index_t o_pad[1];
+      index_t o_adj[1];
+      param_.InferPad(dshape_ncw, o_pad, o_adj);
+
+      CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
+        << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+        << "incorrect dilate size: " << param_.dilate;
+
+      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be smaller than stride[0]";
+
+      Shape<3> oshape;
+      oshape[0] = dshape_ncw[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
+        dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
+
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
 
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+
+      return true;
+    } else if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      CHECK_EQ(dshape.ndim(), 4U) \
+        << "Input data should be 4D in batch-num_filter-y-x";
+      Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+      Shape<4> wshape = Shape4(dshape_nchw[1],
+                               param_.num_filter / param_.num_group,
+                               param_.kernel[0], param_.kernel[1]);
+      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+      }
+
+      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+
+      index_t o_pad[2];
+      index_t o_adj[2];
+      param_.InferPad(dshape_nchw, o_pad, o_adj);
+
+      CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
+        << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+          << "incorrect dilate size: " << param_.dilate;
+
+      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be smaller than stride[0]";
+      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be smaller than stride[1]";
+
+      Shape<4> oshape;
+      oshape[0] = dshape_nchw[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
+        dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
+      oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
+        dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
+
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+
+      return true;
+    } else if (param_.kernel.ndim() == 3) {
+      // 3d conv
+      CHECK_EQ(dshape.ndim(), 5U) \
+        << "Input data should be 5D in batch-num_filter-depth-y-x";
+      Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
+      Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
+                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+      if (!param_.no_bias) {
+        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+      }
+
+      // Note: 3D dilation currently not supported.
+      // Calculations below done to preserve symmetry with 1D/2D code.
+      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+
+      index_t o_pad[3];
+      index_t o_adj[3];
+      param_.InferPad(dshape_ncdhw, o_pad, o_adj);
+
+      CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
+        << "input num_filter must divide group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+        << "incorrect dilate size: " << param_.dilate;
+      CHECK_EQ(param_.dilate.Size(), 1U)
+        << "Dilate is not supported in 3d deconvolution";
+
+      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be smaller than stride[0]";
+      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be smaller than stride[1]";
+      CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be smaller than stride[2]";
+
+      Shape<5> oshape;
+      oshape[0] = dshape_ncdhw[0];
+      oshape[1] = param_.num_filter;
+      oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
+        dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
+      oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
+        dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
+      oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
+        dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
+
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[2] > 0) {
+        CHECK_EQ(param_.target_shape[2], oshape[4]) \
+          << "param_.target_shape[2] was not reasonable, please set it carefully";
+      }
+
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+
+      return true;
+    } else {
+      LOG(FATAL) << "Unknown convolution type";
+      return false;
+    }
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new DeconvolutionProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "Deconvolution";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]};
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  DeconvolutionParam param_;
+};  // class DeconvolutionProp
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
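
The InferShape logic reinstated above derives each spatial output extent of the transposed convolution as stride * (in - 1) + dilated_kernel - 2 * pad + adj, with dilated_kernel = dilate * (kernel - 1) + 1 (cf. DilatedKernelSize). A small standalone illustration of that arithmetic; the helper DeconvOutSize is hypothetical, not part of the patch:

    #include <cstdio>

    // Hypothetical helper: one spatial extent of a transposed convolution,
    //   out = stride * (in - 1) + dilated_kernel - 2 * pad + adj,
    // with dilated_kernel = dilate * (kernel - 1) + 1.
    static int DeconvOutSize(int in, int kernel, int stride, int pad, int adj,
                             int dilate = 1) {
      const int dilated_kernel = dilate * (kernel - 1) + 1;
      return stride * (in - 1) + dilated_kernel - 2 * pad + adj;
    }

    int main() {
      // 4x4 kernel, stride 2, pad 1: the usual upsampling case, the extent doubles.
      std::printf("%d\n", DeconvOutSize(4, 4, 2, 1, 0));  // 8
      std::printf("%d\n", DeconvOutSize(7, 4, 2, 1, 0));  // 14
      return 0;
    }
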
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index a3fc915eb0..9d3c040c1d 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -21,408 +21,45 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cc
  * \brief
- * \author Wei Wu, Da Zheng
+ * \author Wei Wu
 */
 
 #include "./deconvolution-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
-
-static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs,
-                               std::vector<TShape> *in_shape,
-                               std::vector<TShape> *out_shape) {
-  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
-#if MXNET_USE_CUDNN == 0
-  if (param_.kernel.ndim() > 2) {
-    LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
-    return false;
-  }
-#endif  // CUDNN
-
-  using namespace mshadow;
-  if (!param_.no_bias) {
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-  } else {
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-  }
-  out_shape->resize(1, TShape());
-  const TShape &dshape = (*in_shape)[deconv::kData];
-  if (dshape.ndim() ==  0) return false;
-
-  if (param_.kernel.ndim() == 1) {
-    // 1d conv
-    CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-    Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
-    Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
-        param_.kernel[0]);
-    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-    }
-
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-
-    index_t o_pad[1];
-    index_t o_adj[1];
-    param_.InferPad(dshape_ncw, o_pad, o_adj);
-
-    CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-
-    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be smaller than stride[0]";
-
-    Shape<3> oshape;
-    oshape[0] = dshape_ncw[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
-      dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
-
-    if (param_.target_shape.ndim() > 0) {
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-    }
-
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-
-    return true;
-  } else if (param_.kernel.ndim() == 2) {
-    // 2d conv
-    CHECK_EQ(dshape.ndim(), 4U) \
-      << "Input data should be 4D in batch-num_filter-y-x";
-    Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
-    Shape<4> wshape = Shape4(dshape_nchw[1],
-        param_.num_filter / param_.num_group,
-        param_.kernel[0], param_.kernel[1]);
-    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-    }
-
-    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-
-    index_t o_pad[2];
-    index_t o_adj[2];
-    param_.InferPad(dshape_nchw, o_pad, o_adj);
-
-    CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-
-    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be smaller than stride[0]";
-    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be smaller than stride[1]";
-
-    Shape<4> oshape;
-    oshape[0] = dshape_nchw[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
-      dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
-    oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
-      dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
-
-    if (param_.target_shape.ndim() > 1) {
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-    }
-
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-
-    return true;
-  } else if (param_.kernel.ndim() == 3) {
-    // 3d conv
-    CHECK_EQ(dshape.ndim(), 5U) \
-      << "Input data should be 5D in batch-num_filter-depth-y-x";
-    Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
-    Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
-        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-    }
-
-    // Note: 3D dilation currently not supported.
-    // Calculations below done to preserve symmetry with 1D/2D code.
-    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-
-    index_t o_pad[3];
-    index_t o_adj[3];
-    param_.InferPad(dshape_ncdhw, o_pad, o_adj);
-
-    CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
-      << "input num_filter must divide group size";
-    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-      << "output num_filter must divide group size";
-    CHECK_GT(param_.kernel.Size(), 0U) \
-      << "incorrect kernel size: " << param_.kernel;
-    CHECK_GT(param_.stride.Size(), 0U) \
-      << "incorrect stride size: " << param_.stride;
-    CHECK_GT(param_.dilate.Size(), 0U) \
-      << "incorrect dilate size: " << param_.dilate;
-    CHECK_EQ(param_.dilate.Size(), 1U)
-      << "Dilate is not supported in 3d deconvolution";
-
-    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be smaller than stride[0]";
-    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be smaller than stride[1]";
-    CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be smaller than stride[2]";
-
-    Shape<5> oshape;
-    oshape[0] = dshape_ncdhw[0];
-    oshape[1] = param_.num_filter;
-    oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
-      dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
-    oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
-      dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
-    oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
-      dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
-
-    if (param_.target_shape.ndim() > 2) {
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[2] > 0) {
-        CHECK_EQ(param_.target_shape[2], oshape[4]) \
-          << "param_.target_shape[2] was not reasonable, please set it carefully";
-      }
-    }
-
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-
-    return true;
-  } else {
-    LOG(FATAL) << "Unknown convolution type";
-    return false;
-  }
-}
-
-static inline std::vector<std::string> ListArguments(const DeconvolutionParam& param_) {
-  if (!param_.no_bias) {
-    return {"data", "weight", "bias"};
-  } else {
-    return {"data", "weight"};
-  }
-}
-
-static bool DeconvolutionType(const nnvm::NodeAttrs& attrs,
-                              std::vector<int> *in_type, std::vector<int> *out_type) {
-  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  CHECK_GE(in_type->size(), 1U);
-  int dtype = (*in_type)[0];
-  CHECK_NE(dtype, -1) << "First input must have specified type";
-  for (index_t i = 0; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype;
-    } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
-    }
-  }
-  out_type->clear();
-  out_type->push_back(dtype);
-  return true;
-}
-
-inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
-                                     const int dev_mask,
-                                     DispatchMode* dispatch_mode,
-                                     std::vector<int> *in_attrs,
-                                     std::vector<int> *out_attrs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), in_expected);
-  CHECK_EQ(out_attrs->size(), 1);
-
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
-}
-
-inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
-                                             const int dev_mask,
-                                             DispatchMode* dispatch_mode,
-                                             std::vector<int> *in_attrs,
-                                             std::vector<int> *out_attrs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  uint32_t out_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U);
-  CHECK_EQ(out_attrs->size(), out_expected);
-
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
-}
-
-#if MXNET_USE_MKLDNN == 1
-static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                      const OpContext& ctx,
-                                      const std::vector<NDArray>& inputs,
-                                      const std::vector<OpReqType>& req,
-                                      const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req,
-                       outputs);
-    return;
-  }
-  FallBackCompute(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req,
-                  outputs);
+template<>
+Operator* CreateOp<cpu>(DeconvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DeconvolutionOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                          const OpContext& ctx,
-                                          const std::vector<NDArray>& inputs,
-                                          const std::vector<OpReqType>& req,
-                                          const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNConv(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req,
-                       outputs);
-    return;
-  }
-  FallBackCompute(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req,
-                  outputs);
+Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                              std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx);
 }
-#endif
-
-static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) {
-  using namespace mshadow;
-  DeconvolutionParam param_;
-  param_.Init(attrs->dict);
-  if (param_.kernel.ndim() == 1) {
-    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
-  } else if (param_.kernel.ndim() == 2) {
-    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
-  } else {
-    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
-    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-    if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
-  }
-  attrs->parsed = std::move(param_);
-}
-
-struct DeconvolutionGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-    heads.push_back(n->inputs[deconv::kData]);
-    heads.push_back(n->inputs[deconv::kWeight]);
-    const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(n->attrs.parsed);
-    if (!param.no_bias)
-      heads.push_back(n->inputs[deconv::kBias]);
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
 
 DMLC_REGISTER_PARAMETER(DeconvolutionParam);
 
-NNVM_REGISTER_OP(Deconvolution)
-.describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the "
-    "input tensor. This operation can be seen as the gradient of Convolution operation with "
-    "respect to its input. Convolution usually reduces the size of the input. Transposed "
-    "convolution works the other way, going from a smaller input to a larger output while "
-    "preserving the connectivity pattern.")
-.set_num_inputs([](const NodeAttrs& attrs) {
-  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-.set_num_outputs(1)
-.set_attr_parser(DeconvolutionParamParser)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return ListArguments(nnvm::get<DeconvolutionParam>(attrs.parsed));
-})
-.set_attr<nnvm::FInferShape>("FInferShape", DeconvolutionShape)
-.set_attr<nnvm::FInferType>("FInferType", DeconvolutionType)
-.set_attr<FInferStorageType>("FInferStorageType", DeconvStorageType)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr<FCompute>("FCompute<cpu>", DeconvolutionCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
+MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp)
 .add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
 .add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
     "operation.")
-.add_arguments(DeconvolutionParam::__FIELDS__());
-
-NNVM_REGISTER_OP(_backward_Deconvolution)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BackwardDeconvStorageType)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr_parser(DeconvolutionParamParser)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", DeconvolutionGradCompute<cpu>);
+.add_arguments(DeconvolutionParam::__FIELDS__())
+.describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the "
+    "input tensor. This operation can be seen as the gradient of Convolution operation with "
+    "respect to its input. Convolution usually reduces the size of the input. Transposed "
+    "convolution works the other way, going from a smaller input to a larger output while "
+    "preserving the connectivity pattern.");
 
 }  // namespace op
 }  // namespace mxnet
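
This file now returns to the legacy OperatorProperty registration, where CreateOperatorEx infers shapes and then CreateOp instantiates the operator for the concrete element type through MSHADOW_REAL_TYPE_SWITCH. A hedged, dependency-free sketch of that dtype-dispatch pattern; all names below are illustrative, not the MXNet API:

    #include <cstdio>
    #include <memory>

    // Illustrative names only; MSHADOW_REAL_TYPE_SWITCH plays this role in MXNet.
    struct Op { virtual ~Op() = default; virtual void Run() = 0; };

    template <typename DType>
    struct DeconvLikeOp : Op {
      void Run() override { std::printf("element size: %zu bytes\n", sizeof(DType)); }
    };

    enum TypeFlag { kFloat32, kFloat64, kFloat16 };

    static std::unique_ptr<Op> CreateOpForType(TypeFlag dtype) {
      switch (dtype) {  // runtime type flag selects the template instantiation
        case kFloat64: return std::make_unique<DeconvLikeOp<double>>();
        case kFloat32:
        default:       return std::make_unique<DeconvLikeOp<float>>();
      }
    }

    int main() {
      CreateOpForType(kFloat64)->Run();  // picks the double instantiation at runtime
      return 0;
    }
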
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index c7395428c2..623770170d 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cu
  * \brief
- * \author Wei Wu, Da Zheng
+ * \author Wei Wu
 */
 
 #include "./deconvolution-inl.h"
@@ -31,29 +31,13 @@
 
 namespace mxnet {
 namespace op {
-
-#if MXNET_USE_CUDNN == 1
-template<typename DType>
-static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& param,
-                                                     int forward_compute_type,
-                                                     int backward_compute_type,
-                                                     const std::vector<TShape>& in_shape,
-                                                     const std::vector<TShape>& out_shape,
-                                                     const Context& ctx) {
-  static thread_local CuDNNDeconvolutionOp<DType> op;
-  op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx);
-  return op;
-}
-#endif
-
 template<>
-void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                               const OpContext& ctx,
-                               const std::vector<TBlob>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<TBlob>& outputs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  int dtype = inputs[0].type_flag_;
+Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  // Logic here parallels that in Convolution.cu
+  Operator *op = NULL;
 
 #if MXNET_USE_CUDNN == 1
   // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
@@ -61,88 +45,23 @@ void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      DeconvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Forward(ctx, inputs, req, outputs);
-    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      op = new DeconvolutionOp<gpu, DType>(param);
+    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
       LOG(WARNING) <<
         "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
-      DeconvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Forward(ctx, inputs, req, outputs);
+      op = new DeconvolutionOp<gpu, DType>(param);
     } else {
-      std::vector<TShape> in_shape(inputs.size());
-      std::vector<TShape> out_shape(1, outputs[0].shape_);
-      for (size_t i = 0; i < in_shape.size(); i++) {
-        in_shape[i] = inputs[i].shape_;
-      }
-      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
-          in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs);
+      op = new CuDNNDeconvolutionOp<DType>(param, compute_type, compute_type,
+                                           *in_shape, *out_shape, ctx);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    DeconvolutionOp<gpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  })
-#endif  // MXNET_USE_CUDNN
-}
-
-template<>
-void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                                   const OpContext& ctx,
-                                   const std::vector<TBlob>& inputs,
-                                   const std::vector<OpReqType>& req,
-                                   const std::vector<TBlob>& outputs) {
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  const TBlob &out_grad = inputs[0];
-  const std::vector<TBlob> &in_grad = outputs;
-  int dtype = out_grad.type_flag_;
-
-#if MXNET_USE_CUDNN == 1
-  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
-  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
-
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.cudnn_off) {
-      DeconvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
-      LOG(WARNING) <<
-        "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
-      DeconvolutionOp<gpu, DType> op;
-      op.Init(param);
-      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    } else {
-      std::vector<TShape> in_shape(in_data.size());
-      std::vector<TShape> out_shape(1, out_grad.shape_);
-      for (size_t i = 0; i < in_shape.size(); i++) {
-        in_shape[i] = in_data[i].shape_;
-      }
-      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
-          in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx,
-            std::vector<TBlob>{out_grad}, in_data, req, in_grad);
-    }
-  })
-#else
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    DeconvolutionOp<gpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    op = new DeconvolutionOp<gpu, DType>(param);
   })
 #endif  // MXNET_USE_CUDNN
+  return op;
 }
 
-NNVM_REGISTER_OP(Deconvolution)
-.set_attr<FCompute>("FCompute<gpu>", DeconvolutionCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Deconvolution)
-.set_attr<FCompute>("FCompute<gpu>", DeconvolutionGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
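
For context on what the revert removes here: the FCompute-style path kept a single CuDNNDeconvolutionOp per thread in a function-local static and re-initialized it on every invocation, while the restored CreateOp<gpu> heap-allocates a fresh Operator per bind. A minimal, MXNet-free sketch of that thread-local caching idiom follows; Param, Op, Init and GetCachedOp are stand-in names, not MXNet types:

    #include <iostream>

    struct Param { int kernel; };

    // Stand-in for an operator whose descriptors are expensive to rebuild.
    struct Op {
      void Init(const Param& p) { param = p; }  // cheap reconfiguration
      void Forward() { std::cout << "forward, kernel=" << param.kernel << "\n"; }
      Param param{};
    };

    // One cached instance per thread, re-initialized on every call,
    // mirroring the removed GetCuDNNDeconvOp() helper.
    Op& GetCachedOp(const Param& p) {
      static thread_local Op op;
      op.Init(p);
      return op;
    }

    int main() {
      GetCachedOp(Param{3}).Forward();
      GetCachedOp(Param{5}).Forward();  // same instance, new configuration
    }
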
diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h
index 0af8cae51c..c4b7a47875 100644
--- a/src/operator/nn/depthwise_convolution-inl.h
+++ b/src/operator/nn/depthwise_convolution-inl.h
@@ -39,11 +39,11 @@ namespace mxnet {
 namespace op {
 using namespace tf::depthwise_conv;
 template<typename DType>
-class DepthwiseConvolutionOp {
+class DepthwiseConvolutionOp : public Operator {
  public:
-  void Init(const ConvolutionParam& param,
-            const std::vector<TShape>& in_shape,
-            const std::vector<TShape>& out_shape) {
+  explicit DepthwiseConvolutionOp(const ConvolutionParam& param,
+                                  const std::vector<TShape>& in_shape,
+                                  const std::vector<TShape>& out_shape) {
     args_.batch = in_shape[conv::kData][0];
     args_.in_channel = in_shape[conv::kData][1];
     args_.in_height = in_shape[conv::kData][2];
@@ -62,16 +62,19 @@ class DepthwiseConvolutionOp {
 
   ~DepthwiseConvolutionOp() {}
 
-  void Forward(const OpContext &ctx,
-               const std::vector<TBlob> &in_data,
-               const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data);
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args);
 
-  void Backward(const OpContext &ctx,
-                const std::vector<TBlob> &out_grad,
-                const std::vector<TBlob> &in_data,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &in_grad);
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args);
 
  private:
   DepthwiseArgs args_;
@@ -279,7 +282,8 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Forward(const OpContext &ctx,
                                             const std::vector<TBlob> &in_data,
                                             const std::vector<OpReqType> &req,
-                                            const std::vector<TBlob> &out_data) {
+                                            const std::vector<TBlob> &out_data,
+                                            const std::vector<TBlob> &aux_states) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
@@ -301,8 +305,10 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Backward(const OpContext &ctx,
                                              const std::vector<TBlob> &out_grad,
                                              const std::vector<TBlob> &in_data,
+                                             const std::vector<TBlob> &out_data,
                                              const std::vector<OpReqType> &req,
-                                             const std::vector<TBlob> &in_grad) {
+                                             const std::vector<TBlob> &in_grad,
+                                             const std::vector<TBlob> &aux_states) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
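
The DepthwiseConvolutionOp constructor above only unpacks the NCHW input/output shapes into DepthwiseArgs; the actual kernels live in depthwise_convolution_tf.cuh. As a reminder of the arithmetic, depthwise convolution applies one filter per input channel with no summation across channels. A naive CPU-only sketch (stride 1, no padding, stand-in names, none of the MXNet types):

    #include <vector>
    #include <cstdio>

    // Naive depthwise convolution: one filter per channel, stride 1, no padding.
    // data: C x H x W, filter: C x K x K, out: C x (H-K+1) x (W-K+1)
    void DepthwiseConv(const std::vector<float>& data, const std::vector<float>& filter,
                       std::vector<float>& out, int C, int H, int W, int K) {
      const int OH = H - K + 1, OW = W - K + 1;
      out.assign(C * OH * OW, 0.f);
      for (int c = 0; c < C; ++c)
        for (int oh = 0; oh < OH; ++oh)
          for (int ow = 0; ow < OW; ++ow) {
            float acc = 0.f;
            for (int kh = 0; kh < K; ++kh)
              for (int kw = 0; kw < K; ++kw)
                acc += data[(c * H + oh + kh) * W + ow + kw] *
                       filter[(c * K + kh) * K + kw];
            out[(c * OH + oh) * OW + ow] = acc;
          }
    }

    int main() {
      std::vector<float> data(2 * 3 * 3, 1.f), filter(2 * 2 * 2, 1.f), out;
      DepthwiseConv(data, filter, out, 2, 3, 3, 2);
      std::printf("out[0] = %.1f\n", out[0]);  // 4.0: a 2x2 window of ones
    }
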
diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh
index e4dfd8292d..c7f48e6861 100644
--- a/src/operator/nn/depthwise_convolution_tf.cuh
+++ b/src/operator/nn/depthwise_convolution_tf.cuh
@@ -24,8 +24,8 @@
  *        are different with origin version.
  * \author shuqian.qu@hobot.cc
 */
-#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
-#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
+#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
 #include "../../common/cuda_utils.h"
 #include "../mxnet_op.h"
 
@@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu>
 }  // namespace depthwise_conv
 }  // namespace tf
 
-#endif  // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
+#endif  // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index cff35a3cef..715a6f4ee2 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout-inl.h
  * \brief
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_
@@ -71,7 +71,7 @@ struct DropoutParam : public dmlc::Parameter<DropoutParam> {
 };  // struct DropoutParam
 
 template<typename xpu, typename DType>
-class DropoutOp {
+class DropoutOp : public Operator {
 #if defined(USE_MKL) && defined(_OPENMP)
   static void BernoulliGenerate(common::random::RandGenerator<cpu, DType> gen,
                                 int n, double p, int* r) {
@@ -206,15 +206,16 @@ class DropoutOp {
     }
   };
 
-  void Init(const DropoutParam &param) {
+  explicit DropoutOp(DropoutParam param) {
     this->pkeep_ = 1.0f - param.p;
     this->mode_ = static_cast<dropout::DropoutOpMode>(param.mode);
   }
 
-  void Forward(const OpContext &ctx,
-               const std::vector<TBlob> &in_data,
-               const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data) {
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
     if (req[dropout::kOut] != kNullOp) {
       CHECK_EQ(in_data.size(), 1U);
       if (ctx.is_train) {
@@ -248,13 +249,17 @@ class DropoutOp {
     }
   }
 
-  void Backward(const OpContext &ctx,
-                const std::vector<TBlob> &out_grad,
-                const std::vector<TBlob> &out_data,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &in_grad) {
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
     using namespace mshadow;
     using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_grad.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     if (ctx.is_train || mode_ == dropout::kAlways) {
       if (!MKLBackward(s, this->pkeep_, in_grad, out_data, out_grad)) {
@@ -288,42 +293,110 @@ class DropoutOp {
   dropout::DropoutOpMode mode_;
 };  // class DropoutOp
 
-template<typename xpu>
-void DropoutCompute(const nnvm::NodeAttrs& attrs,
-                    const OpContext& ctx,
-                    const std::vector<TBlob>& inputs,
-                    const std::vector<OpReqType>& req,
-                    const std::vector<TBlob>& outputs) {
-  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    static thread_local DropoutOp<xpu, DType> op;
-    op.Init(param);
-    op.Forward(ctx, inputs, req, outputs);
-  });
-}
 
 template<typename xpu>
-void DropoutGradCompute(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx,
-                        const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
-  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 2U);
-  CHECK_EQ(outputs.size(), 1);
-  CHECK_EQ(req.size(), 1);
-  std::vector<TBlob> out_grads(2);
-  std::vector<TBlob> out_data(2);
-  out_grads[dropout::kOut] = inputs[0];
-  out_data[dropout::kMask] = inputs[1];
-
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    static thread_local DropoutOp<xpu, DType> op;
-    op.Init(param);
-    op.Backward(ctx, out_grads, out_data, req, outputs);
-  });
-}
+Operator *CreateOp(DropoutParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class DropoutProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1U);
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1U);
+    int dtype = in_type->at(0);
+
+    if (dtype == -1) {
+      LOG(FATAL) << "input type to dropout is not specified.";
+      return false;
+    }
+
+    size_t nout = this->ListOutputs().size();
+    out_type->clear();
+    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new DropoutProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "Dropout";
+  }
 
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[dropout::kOut], out_data[dropout::kMask]};
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{out_grad[dropout::kOut], in_grad[dropout::kData]}};
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[dropout::kData], out_data[dropout::kOut]}};
+  }
+
+  std::vector<ResourceRequest> ForwardResource(const std::vector<TShape> &in_shape) const override {
+    return { ResourceRequest::kParallelRandom };
+  }
+
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 2;
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "mask"};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  DropoutParam param_;
+};  // class DropoutProp
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DROPOUT_INL_H_
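
DropoutOp on both sides of this revert implements inverted dropout: during training each element is zeroed with probability p and the survivors are scaled by 1/(1-p), so inference needs no rescaling, and the backward pass just multiplies the incoming gradient by the saved mask. A standalone sketch of that arithmetic, using a plain std::mt19937 purely for illustration:

    #include <random>
    #include <vector>
    #include <cstdio>

    int main() {
      const float p = 0.5f, pkeep = 1.0f - p;
      std::mt19937 rng(0);
      std::bernoulli_distribution keep(pkeep);

      std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, mask(x.size()), y(x.size());
      // Forward (training): zero with probability p, scale survivors by 1/pkeep.
      for (size_t i = 0; i < x.size(); ++i) {
        mask[i] = keep(rng) ? 1.0f / pkeep : 0.0f;
        y[i] = x[i] * mask[i];
      }
      // Backward: the gradient flows only through kept elements, with the same scale.
      std::vector<float> dy = {1.f, 1.f, 1.f, 1.f}, dx(x.size());
      for (size_t i = 0; i < x.size(); ++i) dx[i] = dy[i] * mask[i];

      for (size_t i = 0; i < x.size(); ++i)
        std::printf("y[%zu]=%.1f dx[%zu]=%.1f\n", i, y[i], i, dx[i]);
    }
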
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index dd5f1e58fb..3aa832a713 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #include "./dropout-inl.h"
@@ -29,21 +29,24 @@
 
 namespace mxnet {
 namespace op {
-
-struct DropoutGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    std::vector<nnvm::NodeEntry> heads;
-    heads.push_back(ograds[0]);
-    heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0});
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
+template<>
+Operator *CreateOp<cpu>(DropoutParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DropoutOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                              std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+}
 
 DMLC_REGISTER_PARAMETER(DropoutParam);
 
-NNVM_REGISTER_OP(Dropout)
+MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp)
 .describe(R"(Applies dropout operation to input array.
 
 - During training, each element of the input is set to zero with probability p.
@@ -74,66 +77,8 @@ Example::
   [[ 3.     0.5   -0.5    2.     7.   ]
    [ 2.    -0.4    7.     3.     0.2  ]]
 )" ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs(2)
-.set_attr_parser(ParamParser<DropoutParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"data"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"output", "mask"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
-    [](const NodeAttrs& attrs) {
-  return 1;
-})
-.set_attr<nnvm::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
-      std::vector<TShape> *in_shape, std::vector<TShape> *out_shape){
-  using namespace mshadow;
-  CHECK_EQ(in_shape->size(), 1U);
-  const TShape &dshape = in_shape->at(0);
-  if (dshape.ndim() == 0) return false;
-  out_shape->clear();
-  out_shape->push_back(dshape);
-  out_shape->push_back(dshape);
-  return true;
-})
-.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
-      std::vector<int> *in_type, std::vector<int> *out_type) {
-  CHECK_EQ(in_type->size(), 1U);
-  int dtype = in_type->at(0);
-
-  if (dtype == -1) {
-    LOG(FATAL) << "input type to dropout is not specified.";
-    return false;
-  }
-
-  size_t nout = 2;
-  out_type->clear();
-  for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
-  return true;
-})
-.set_attr<FCompute>("FCompute<cpu>", DropoutCompute<cpu>)
-.set_attr<nnvm::FGradient>("FGradient", DropoutGrad{"_backward_Dropout"})
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ ResourceRequest::kParallelRandom };
-})
 .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.")
 .add_arguments(DropoutParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_Dropout)
-.set_num_outputs(1)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr_parser(ParamParser<DropoutParam>)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-.set_attr<FCompute>("FCompute<cpu>", DropoutGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu
index e655278822..f416c58832 100644
--- a/src/operator/nn/dropout.cu
+++ b/src/operator/nn/dropout.cu
@@ -21,20 +21,21 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #include "./dropout-inl.h"
 
 namespace mxnet {
 namespace op {
-
-NNVM_REGISTER_OP(Dropout)
-.set_attr<FCompute>("FCompute<gpu>", DropoutCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Dropout)
-.set_attr<FCompute>("FCompute<gpu>", DropoutGradCompute<gpu>);
-
+template<>
+Operator *CreateOp<gpu>(DropoutParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DropoutOp<gpu, DType>(param);
+  });
+  return op;
+}
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index e8e95643e6..9f3deec244 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -43,7 +43,6 @@ namespace op {
 // These enums are only visible within this header
 namespace fullc {
 enum FullyConnectedOpInputs {kData, kWeight, kBias};
-enum FullyConnectedOpResource {kTempSpace};
 enum FullyConnectedOpOutputs {kOut};
 }  // fullc
 
@@ -62,160 +61,240 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   }
 };
 
+/**
+ * \brief This is the implementation of the fully connected operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
 template<typename xpu, typename DType>
-void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
-               const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
-               const std::vector<TBlob> &out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  if (req[fullc::kOut] == kNullOp) return;
-  CHECK_EQ(req[fullc::kOut], kWriteTo);
-  // TODO(bing): check the BLAS Handle, be careful
-  // maybe need blas handle from context
-  // TODO(bing): judge shape to remove flatten op
-  Stream<xpu> *s = ctx.get_stream<xpu>();
+class FullyConnectedOp : public Operator {
+ public:
+  explicit FullyConnectedOp(FullyConnectedParam p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    if (req[fullc::kOut] == kNullOp) return;
+    CHECK_EQ(req[fullc::kOut], kWriteTo);
+    size_t expected = param_.no_bias ? 2 : 3;
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(out_data.size(), 1U);
+    // TODO(bing): check the BLAS Handle, be careful
+    // maybe need blas handle from context
+    // TODO(bing): judge shape to remove flatten op
+    Stream<xpu> *s = ctx.get_stream<xpu>();
 #if defined(__CUDACC__)
-  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-      << "Must init CuBLAS handle in stream";
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
 #endif  // __CUDACC__
-  const TShape& ishape = in_data[fullc::kData].shape_;
-  const TShape& oshape = out_data[fullc::kOut].shape_;
-
-  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-  Tensor<xpu, 2, DType> data, out;
-  if (!param.flatten) {
-    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-  } else {
-    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-  }
+    const TShape& ishape = in_data[fullc::kData].shape_;
+    const TShape& oshape = out_data[fullc::kOut].shape_;
 
-  // Legacy approach shown here for comparison:
-  //   out = dot(data, wmat.T());
-  linalg_gemm(data, wmat, out, false, true, s);
-  if (!param.no_bias) {
-    Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
-    out += repmat(bias, data.size(0));
-  }
-}
+    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+    Tensor<xpu, 2, DType> data, out;
+    if (!param_.flatten) {
+      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+    } else {
+      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+    }
 
-template<typename xpu, typename DType>
-void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
-                const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data,
-                const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  // TODO(bing): check the BLAS Handle, be careful
-  //  maybe need blas handle from context
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TShape& ishape = in_data[fullc::kData].shape_;
-  const TShape& oshape = out_grad[fullc::kOut].shape_;
-
-  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-  Tensor<xpu, 2, DType> data, grad, gdata;
-  if (!param.flatten) {
-    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-  } else {
-    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    // Legacy approach shown here for comparison:
+    //   out = dot(data, wmat.T());
+    linalg_gemm(data, wmat, out, false, true, s);
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
+      out += repmat(bias, data.size(0));
+    }
   }
 
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    size_t expected = param_.no_bias ? 2 : 3;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    // TODO(bing): check the BLAS Handle, be careful
+    //  maybe need blas handle from context
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& ishape = in_data[fullc::kData].shape_;
+    const TShape& oshape = out_grad[fullc::kOut].shape_;
+
+    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+    Tensor<xpu, 2, DType> data, grad, gdata;
+    if (!param_.flatten) {
+      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+    } else {
+      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    }
+
 #if defined(__CUDACC__)
-  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-      << "Must init CuBLAS handle in stream";
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
 #endif
-  //  backprop
-  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
-  // gradient of weight
-  Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
-  // Legacy approach shown here for comparison:
-  //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
-  linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
-  // gradient of bias
-  if (!param.no_bias) {
-    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
-    Assign(gbias, req[fullc::kBias], sum_rows(grad));
+    //  backprop
+    CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+    // gradient of weight
+    Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
+    // Legacy approach shown here for comparison:
+    //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
+    linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
+    // gradient of bias
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
+      Assign(gbias, req[fullc::kBias], sum_rows(grad));
+    }
+    // gradient of data
+    // Legacy approach shown here for comparison:
+    //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
+    linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
   }
-  // gradient of data
-  // Legacy approach shown here for comparison:
-  //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
-  linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
-}
 
+ private:
+  FullyConnectedParam param_;
+};  // class FullyConnectedOp
+
+// Declare factory function, used for dispatch specialization
 template<typename xpu>
-void FullyConnectedCompute(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx,
-                           const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(inputs.size(), in_expected);
-  CHECK_EQ(outputs.size(), 1U);
-  int dtype = inputs[0].type_flag_;
-
-  switch (dtype) {
-  case mshadow::kFloat32:
-    FCForward<xpu, float>(ctx, param, inputs, req, outputs);
-    break;
-  case mshadow::kFloat64:
-    FCForward<xpu, double>(ctx, param, inputs, req, outputs);
-    break;
-  case mshadow::kFloat16:
-    LOG(FATAL) << "float16 fully connected layer is currently"
-                  "only supported by CuDNN version.";
-    break;
-  default:
-    LOG(FATAL) << "Unsupported type " << dtype;
+Operator* CreateOp(FullyConnectedParam param, int dtype,
+                   std::vector<TShape> *in_shape,
+                   std::vector<TShape> *out_shape,
+                   Context ctx);
+
+#if DMLC_USE_CXX11
+class FullyConnectedProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    if (!param_.no_bias) {
+      return {"data", "weight", "bias"};
+    } else {
+      return {"data", "weight"};
+    }
   }
-}
 
-template<typename xpu>
-void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
-                               const OpContext& ctx,
-                               const std::vector<TBlob>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<TBlob>& outputs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t out_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(inputs.size(), 3U);
-  CHECK_EQ(outputs.size(), out_expected);
-  CHECK_EQ(req.size(), out_expected);
-
-  std::vector<TBlob> out_grad{inputs[0]};
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  int dtype = inputs[0].type_flag_;
-
-  switch (dtype) {
-  case mshadow::kFloat32:
-    FCBackward<xpu, float>(ctx, param, out_grad, in_data, req, outputs);
-    break;
-  case mshadow::kFloat64:
-    FCBackward<xpu, double>(ctx, param, out_grad, in_data, req, outputs);
-    break;
-  case mshadow::kFloat16:
-    LOG(FATAL) << "float16 fully connected layer is currently"
-                  "only supported by CuDNN version.";
-    break;
-  default:
-    LOG(FATAL) << "Unsupported type " << dtype;
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
   }
-}
 
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    if (!param_.no_bias) {
+      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+    } else {
+      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    }
+    CHECK_EQ(out_shape->size(), 1U);
+    TShape dshape = (*in_shape)[fullc::kData];
+    TShape oshape = (*out_shape)[0];
+    // require data to be known
+    if (dshape.ndim() ==  0) return false;
+
+    index_t num_input;
+    if (!param_.flatten) {
+      num_input = dshape[dshape.ndim()-1];
+    } else {
+      num_input = dshape.ProdShape(1, dshape.ndim());
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input));
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden));
+    }
+
+    if (!param_.flatten) {
+      TShape result_shape(dshape);
+      result_shape[dshape.ndim()-1] = param_.num_hidden;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+    } else {
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden));
+    }
+    if (oshape.ndim() != 0) {
+      dshape[0] = oshape[0];
+      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
+    }
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    nnvm::NodeAttrs attrs;
+    attrs.name = "FullyConnected";
+    return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
+      attrs, in_type, out_type, -1);
+  }
+
+  OperatorProperty* Copy() const override {
+    FullyConnectedProp* fc_sym = new FullyConnectedProp();
+    fc_sym->param_ = this->param_;
+    return fc_sym;
+  }
+
+  std::string TypeString() const override {
+    return "FullyConnected";
+  }
+
+  // declare dependency and inplace optimization options
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]};
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{in_data[fullc::kData], in_grad[fullc::kData]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  FullyConnectedParam param_;
+};  // class FullyConnectedProp
+#endif
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_
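
The forward and backward passes above express, via linalg_gemm, Y = X * W^T + b, gW = dY^T * X, gb = row-sum of dY, and gX = dY * W. A tiny standalone sketch of the same arithmetic on plain arrays (shapes M=2, K=3, N=1 chosen only for readability; no MXNet types):

    #include <cstdio>

    // Fully connected layer: Y = X * W^T + b, with X: M x K, W: N x K, b: N.
    int main() {
      const int M = 2, K = 3, N = 1;
      float X[M][K] = {{1, 2, 3}, {4, 5, 6}};
      float W[N][K] = {{0.1f, 0.2f, 0.3f}};
      float b[N] = {0.5f};
      float Y[M][N] = {};

      // Forward: Y = X * W^T + b
      for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n) {
          for (int k = 0; k < K; ++k) Y[m][n] += X[m][k] * W[n][k];
          Y[m][n] += b[n];
        }

      // Backward with dY = all ones: gW = dY^T * X, gb = row-sum of dY, gX = dY * W.
      float dY[M][N] = {{1}, {1}}, gW[N][K] = {}, gb[N] = {}, gX[M][K] = {};
      for (int n = 0; n < N; ++n)
        for (int k = 0; k < K; ++k)
          for (int m = 0; m < M; ++m) gW[n][k] += dY[m][n] * X[m][k];
      for (int n = 0; n < N; ++n)
        for (int m = 0; m < M; ++m) gb[n] += dY[m][n];
      for (int m = 0; m < M; ++m)
        for (int k = 0; k < K; ++k)
          for (int n = 0; n < N; ++n) gX[m][k] += dY[m][n] * W[n][k];

      std::printf("Y = [%.2f, %.2f], gb = %.1f\n", Y[0][0], Y[1][0], gb[0]);
    }
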
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 4362408a23..9a97816029 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -23,153 +23,58 @@
  * \brief fully connect operator
 */
 #include "./fully_connected-inl.h"
-#include "./mkldnn/mkldnn_ops-inl.h"
-#include "./mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_fully_connected-inl.h"
 #endif  // MXNET_USE_NNPACK
 
 namespace mxnet {
 namespace op {
-
-static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
-                                std::vector<TShape> *in_shape,
-                                std::vector<TShape> *out_shape) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  using namespace mshadow;
-  if (!param.no_bias) {
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-  } else {
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-  }
-  CHECK_EQ(out_shape->size(), 1U);
-  TShape dshape = (*in_shape)[fullc::kData];
-  TShape oshape = (*out_shape)[0];
-  // require data to be known
-  if (dshape.ndim() ==  0) return false;
-
-  index_t num_input;
-  if (!param.flatten) {
-    num_input = dshape[dshape.ndim()-1];
-  } else {
-    num_input = dshape.ProdShape(1, dshape.ndim());
-  }
-  SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
-  if (!param.no_bias) {
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
-  }
-
-  if (!param.flatten) {
-    TShape result_shape(dshape);
-    result_shape[dshape.ndim()-1] = param.num_hidden;
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
-  } else {
-    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
-  }
-  if (oshape.ndim() != 0) {
-    dshape[0] = oshape[0];
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
-  }
-  return true;
-}
-
-#if MXNET_USE_MKLDNN == 1
-void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                const OpContext &ctx,
-                                const std::vector<NDArray> &inputs,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &outputs) {
-  if (SupportMKLDNN(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNFCForward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req,
-                       outputs);
-    return;
-  }
-  FallBackCompute(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-
-void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                    const OpContext &ctx,
-                                    const std::vector<NDArray> &inputs,
-                                    const std::vector<OpReqType> &req,
-                                    const std::vector<NDArray> &outputs) {
-  if (SupportMKLDNN(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNFCBackward(attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute<cpu>, attrs, ctx, inputs, req,
-                       outputs);
-    return;
+template<>
+Operator* CreateOp<cpu>(FullyConnectedParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+#if MXNET_USE_NNPACK == 1
+  const size_t batch_size = (*in_shape)[0][0];
+  // nnp_fully_connected_inference will do optimization for batch-size = 1
+  // nnp_fully_connected_output will do optimization for batch-size > 1
+  switch (dtype) {
+  case mshadow::kFloat32:
+    return new NNPACKFullyConnectedOp<cpu, float>(param);
+  default:
+    break;
   }
-  FallBackCompute(FullyConnectedGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
 #endif
-
-static bool FullyConnectedType(const nnvm::NodeAttrs& attrs,
-                               std::vector<int> *in_type, std::vector<int> *out_type) {
-  CHECK_GE(in_type->size(), 1U);
-  return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
-      attrs, in_type, out_type, -1);
-}
-
-struct FullyConnectedGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-    heads.push_back(n->inputs[fullc::kData]);
-    heads.push_back(n->inputs[fullc::kWeight]);
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  switch (dtype) {
+  case mshadow::kFloat32:
+    op = new FullyConnectedOp<cpu, float>(param);
+    break;
+  case mshadow::kFloat64:
+    op = new FullyConnectedOp<cpu, double>(param);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently "
+                  "only supported by the CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
-};
-
-inline static bool FCStorageType(const nnvm::NodeAttrs& attrs,
-                                 const int dev_mask,
-                                 DispatchMode* dispatch_mode,
-                                 std::vector<int> *in_attrs,
-                                 std::vector<int> *out_attrs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), in_expected);
-  CHECK_EQ(out_attrs->size(), 1);
 
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  return op;
 }
 
-inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs,
-                                         const int dev_mask,
-                                         DispatchMode* dispatch_mode,
-                                         std::vector<int> *in_attrs,
-                                         std::vector<int> *out_attrs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t out_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_attrs->size(), 3U);
-  CHECK_EQ(out_attrs->size(), out_expected);
-
-  DispatchMode wanted_mode;
-#if 0
-  // TODO(zhengda) let's disable MKLDNN for FullyConnected for now.
-  // It seems there is a bug.
-  if (dev_mask == mshadow::cpu::kDevMask)
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape(1, TShape()), aux_shape;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
 }
 
 DMLC_REGISTER_PARAMETER(FullyConnectedParam);
 
-NNVM_REGISTER_OP(FullyConnected)
+MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp)
 .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`.
 
 If ``flatten`` is set to be true, then the shapes are:
@@ -191,59 +96,9 @@ The learnable parameters include both ``weight`` and ``bias``.
 If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
 
 )code" ADD_FILELINE)
-.set_num_inputs([](const NodeAttrs& attrs) {
-  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<FullyConnectedParam>)
-.set_attr<FInferStorageType>("FInferStorageType", FCStorageType)
-.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
-  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  if (!params.no_bias) {
-    return std::vector<std::string>{"data", "weight", "bias"};
-  } else {
-    return std::vector<std::string>{"data", "weight"};
-  }
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr<nnvm::FInferShape>("FInferShape", FullyConnectedShape)
-.set_attr<nnvm::FInferType>("FInferType", FullyConnectedType)
-.set_attr<FCompute>("FCompute<cpu>", FullyConnectedCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", FullyConnectedGrad{"_backward_FullyConnected"})
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(FullyConnectedParam::__FIELDS__());
-
-NNVM_REGISTER_OP(_backward_FullyConnected)
-.set_num_inputs(3)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  return params.no_bias ? 2 : 3;
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{1, 0}};
-})
-.set_attr<FInferStorageType>("FInferStorageType", BackwardFCStorageType)
-.set_attr_parser(ParamParser<FullyConnectedParam>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", FullyConnectedGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
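
The shape logic restored into FullyConnectedProp::InferShape mirrors the removed FullyConnectedShape: with flatten=true the data (B, d1, ..., dk) is treated as (B, d1*...*dk), the weight must be (num_hidden, d1*...*dk), the bias (num_hidden,), and the output (B, num_hidden). A small standalone sketch of that bookkeeping (the input shape below is a made-up example):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<long> dshape = {32, 4, 5};   // hypothetical input shape (B, d1, d2)
      const long num_hidden = 10;
      const bool flatten = true;

      // num_input: product of trailing dims when flattening, else the last dim only.
      long num_input = 1;
      if (flatten) {
        for (size_t i = 1; i < dshape.size(); ++i) num_input *= dshape[i];
      } else {
        num_input = dshape.back();
      }
      std::printf("weight: (%ld, %ld)\n", num_hidden, num_input);    // (10, 20)
      std::printf("bias:   (%ld,)\n", num_hidden);
      if (flatten)
        std::printf("output: (%ld, %ld)\n", dshape[0], num_hidden);  // (32, 10)
    }
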
diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu
index c89d37767c..279a378e2a 100644
--- a/src/operator/nn/fully_connected.cu
+++ b/src/operator/nn/fully_connected.cu
@@ -25,50 +25,16 @@
 #include "./fully_connected-inl.h"
 namespace mxnet {
 namespace op {
-
 template<>
-void FullyConnectedCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                                const OpContext& ctx,
-                                const std::vector<TBlob>& inputs,
-                                const std::vector<OpReqType>& req,
-                                const std::vector<TBlob>& outputs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t in_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(inputs.size(), in_expected);
-  CHECK_EQ(outputs.size(), 1U);
-  int dtype = inputs[0].type_flag_;
-
+Operator* CreateOp<gpu>(FullyConnectedParam param, int dtype,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    FCForward<gpu, DType>(ctx, param, inputs, req, outputs);
-  });
+    op = new FullyConnectedOp<gpu, DType>(param);
+  })
+  return op;
 }
-
-template<>
-void FullyConnectedGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                                    const OpContext& ctx,
-                                    const std::vector<TBlob>& inputs,
-                                    const std::vector<OpReqType>& req,
-                                    const std::vector<TBlob>& outputs) {
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  uint32_t out_expected = param.no_bias ? 2 : 3;
-  CHECK_EQ(inputs.size(), 3U);
-  CHECK_EQ(outputs.size(), out_expected);
-  CHECK_EQ(req.size(), out_expected);
-
-  std::vector<TBlob> out_grad{inputs[0]};
-  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
-  int dtype = inputs[0].type_flag_;
-
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    FCBackward<gpu, DType>(ctx, param, out_grad, in_data, req, outputs);
-  });
-}
-
-NNVM_REGISTER_OP(FullyConnected)
-.set_attr<FCompute>("FCompute<gpu>", FullyConnectedCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_FullyConnected)
-.set_attr<FCompute>("FCompute<gpu>", FullyConnectedGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h
deleted file mode 100644
index fdae1eca0a..0000000000
--- a/src/operator/nn/lrn-inl.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_NN_LRN_INL_H_
-#define MXNET_OPERATOR_NN_LRN_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace lrn_enum {
-enum LRNInputs {kData};
-enum LRNOutputs {kOut, kTmpNorm};
-}  // namespace lrn_enum
-
-struct LRNParam : public dmlc::Parameter<LRNParam> {
-  float alpha;
-  float beta;
-  float knorm;
-  uint32_t nsize;
-  DMLC_DECLARE_PARAMETER(LRNParam) {
-    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
-    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
-    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
-    .describe("The power parameter :math:`\beta` in the LRN expression.");
-    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
-    .describe("The parameter :math:`k` in the LRN expression.");
-    DMLC_DECLARE_FIELD(nsize)
-    .describe("normalization window width in elements.");
-  }
-};  // struct LRNParam
-
-template<typename xpu>
-void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                const std::vector<TBlob> &in_data,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
-  // TODO(xxx): Test with gradient checker
-  CHECK_EQ(in_data.size(), 1U);
-  CHECK_EQ(out_data.size(), 2U);
-  // CHECK_EQ(req.size(), 2);
-  CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-  const real_t salpha = param_.alpha / param_.nsize;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
-  Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
-  Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
-  tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
-  Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
-}
-
-template<typename xpu>
-void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                 const TBlob &out_grad, const TBlob &in_data,
-                 const TBlob &out_norm, const OpReqType &req,
-                 const TBlob &in_grad) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
-  const real_t salpha = param_.alpha / param_.nsize;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 4> grad = out_grad.get<xpu, 4, real_t>(s);
-  Tensor<xpu, 4> tmp_norm = out_norm.get<xpu, 4, real_t>(s);
-  Tensor<xpu, 4> data = in_data.get<xpu, 4, real_t>(s);
-  Tensor<xpu, 4> grad_in = in_grad.get<xpu, 4, real_t>(s);
-  grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
-  grad_in += (- 2.0f * param_.beta * salpha) *
-      chpool<red::sum>(grad * data *
-                       F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
-                       param_.nsize)  * data;
-}
-
-template<typename xpu>
-void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
-                const std::vector<TBlob>& inputs,
-                const std::vector<OpReqType>& req,
-                const std::vector<TBlob>& outputs) {
-  LRNForward<xpu>(attrs, ctx, inputs, req, outputs);
-}
-
-template<typename xpu>
-void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
-                    const std::vector<TBlob>& inputs,
-                    const std::vector<OpReqType>& req,
-                    const std::vector<TBlob>& outputs) {
-  LRNBackward<xpu>(attrs, ctx, inputs[0],  // out_grad
-                   inputs[1],              // in_data
-                   inputs[2],              // out_norm
-                   req[lrn_enum::kData], outputs[lrn_enum::kData]);
-}
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_NN_LRN_INL_H_
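
To make the deleted LRN math concrete: for each channel i, norm_i = k + (alpha/n) * sum of a_j^2 over the window of n adjacent channels around i, and b_i = a_i * norm_i^(-beta), which is what LRNForward computed with chpool<red::sum>. A standalone sketch of that per-channel computation at one spatial position (values arbitrary):

    #include <cmath>
    #include <cstdio>
    #include <algorithm>
    #include <vector>

    int main() {
      const float alpha = 1e-4f, beta = 0.75f, knorm = 2.0f;
      const int nsize = 3, half = nsize / 2;
      std::vector<float> a = {1.f, 2.f, 3.f, 4.f};   // activations across 4 channels
      const int C = static_cast<int>(a.size());

      for (int i = 0; i < C; ++i) {
        // Sum of squares over the nsize-wide channel window centered on i.
        float sumsq = 0.f;
        for (int j = std::max(0, i - half); j <= std::min(C - 1, i + half); ++j)
          sumsq += a[j] * a[j];
        const float norm = knorm + (alpha / nsize) * sumsq;
        std::printf("b[%d] = %f\n", i, a[i] * std::pow(norm, -beta));
      }
    }
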
diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc
deleted file mode 100644
index 2359b49aba..0000000000
--- a/src/operator/nn/lrn.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn.cc
- * \brief
- * \author Bing Xu, Patric Zhao (patric.zhao@intel.com)
-*/
-
-#include "./lrn-inl.h"
-#include "../operator_common.h"
-#if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_lrn-inl.h"
-#endif
-
-namespace mxnet {
-namespace op {
-
-bool LRNShape(const nnvm::NodeAttrs& attrs,
-              std::vector<TShape> *in_shape,
-              std::vector<TShape> *out_shape) {
-  using namespace mshadow;
-  CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-  const TShape &dshape = in_shape->at(0);
-  if (dshape.ndim() == 0) return false;
-  out_shape->clear();
-  out_shape->push_back(dshape);
-  out_shape->push_back(dshape);
-  return true;
-}
-
-inline std::vector<std::string> ListArguments() {
-  return {"data"};
-}
-
-bool LRNType(const nnvm::NodeAttrs& attrs,
-             std::vector<int> *in_type,
-             std::vector<int> *out_type) {
-  CHECK_GE(in_type->size(), 1U);
-  int dtype = (*in_type)[0];
-  CHECK_NE(dtype, -1) << "First input must have specified type";
-  for (index_t i = 0; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype;
-    } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-    }
-  }
-  int n_out = 2;
-  out_type->clear();
-  for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
-  return true;
-}
-
-struct LRNGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                const std::vector<nnvm::NodeEntry>& ograds) const {
-    std::vector<nnvm::NodeEntry> heads;
-    heads.push_back(ograds[0]);  // out_grad
-    heads.push_back(n->inputs[lrn_enum::kData]);
-    heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0});
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
-
-bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs,
-                                const int dev_mask,
-                                DispatchMode* dispatch_mode,
-                                std::vector<int> *in_attrs,
-                                std::vector<int> *out_attrs) {
-  CHECK(!in_attrs->empty());
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask) {
-    storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeEx);
-    return true;
-  }
-#endif
-  storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                      dispatch_mode, DispatchMode::kFCompute);
-  return true;
-}
-
-bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
-                                 const int dev_mask,
-                                 DispatchMode* dispatch_mode,
-                                 std::vector<int> *in_attrs,
-                                 std::vector<int> *out_attrs) {
-  CHECK(!in_attrs->empty());
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask) {
-    storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                        dispatch_mode, DispatchMode::kFComputeEx);
-    return true;
-  }
-#endif
-  storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                      dispatch_mode, DispatchMode::kFCompute);
-  return true;
-}
-
-#if MXNET_USE_MKLDNN == 1
-void LRNComputeExCPU(const nnvm::NodeAttrs &attrs,
-                     const OpContext &ctx,
-                     const std::vector<NDArray> &inputs,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &outputs) {
-  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
-  if (SupportMKLDNN(inputs[0])) {
-    // We only need to test one output array.
-    MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs);
-    MKLDNNLRNForward(ctx, param, inputs[0], req[0], outputs[0]);
-    MKLDNN_OPCHECK_RUN(LRNCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(LRNCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-
-void LRNGradComputeExCPU(const nnvm::NodeAttrs &attrs,
-                         const OpContext &ctx,
-                         const std::vector<NDArray> &inputs,
-                         const std::vector<OpReqType> &req,
-                         const std::vector<NDArray> &outputs) {
-  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
-  const NDArray &out_grad = inputs[0];
-  const NDArray &in_data = inputs[1];
-  const NDArray &in_grad = outputs[0];
-
-  if (SupportMKLDNN(inputs[0])) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNLRNBackward(ctx, param, out_grad, in_data, req[0], in_grad);
-    MKLDNN_OPCHECK_RUN(LRNGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(LRNGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-#endif
-
-DMLC_REGISTER_PARAMETER(LRNParam);
-
-NNVM_REGISTER_OP(LRN)
-.describe(R"code(Applies local response normalization to the input.
-
-The local response normalization layer performs "lateral inhibition" by normalizing
-over local input regions.
-
-If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
-:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
-activity :math:`b_{x,y}^{i}` is given by the expression:
-
-.. math::
-   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
-
-where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
-number of kernels in the layer.
-
-)code" ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs(2)
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
-                                    [](const NodeAttrs& attrs) { return 1; })
-.set_attr_parser(ParamParser<LRNParam>)
-.set_attr<nnvm::FInferShape>("FInferShape", LRNShape)
-.set_attr<nnvm::FInferType>("FInferType", LRNType)
-.set_attr<FInferStorageType>("FInferStorageType", LRNForwardInferStorageType)
-.set_attr<FCompute>("FCompute<cpu>", LRNCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", LRNComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", LRNGrad{"_backward_LRN"})
-.add_argument("data", "NDArray-or-Symbol", "Input data to LRN")
-.add_arguments(LRNParam::__FIELDS__());
-
-NNVM_REGISTER_OP(_backward_LRN)
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<LRNParam>)
-.set_attr<FInferStorageType>("FInferStorageType", LRNBackwardInferStorageType)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", LRNGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", LRNGradCompute<cpu>);
-
-}  // namespace op
-}  // namespace mxnet
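
For readers skimming what this revert removes: the operator registration above documents the LRN formula in its describe() block. A minimal standalone C++ sketch of that formula follows (no MXNet or MKLDNN dependencies; the activity values and the k, alpha, beta, n parameters are illustrative only, and the snippet is separate from the diff):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Standalone sketch of the LRN formula registered above:
    //   b[i] = a[i] / (k + alpha * sum_{j in window} a[j]^2)^beta
    // where the window spans n adjacent channels centered on channel i.
    int main() {
      const std::vector<float> a = {1.0f, 2.0f, 3.0f, 4.0f};  // activities at one (x, y)
      const float k = 2.0f, alpha = 1e-4f, beta = 0.75f;       // illustrative parameters
      const int n = 3, N = static_cast<int>(a.size());
      for (int i = 0; i < N; ++i) {
        float sum = 0.0f;
        for (int j = std::max(0, i - n / 2); j <= std::min(N - 1, i + n / 2); ++j)
          sum += a[j] * a[j];
        const float b = a[i] / std::pow(k + alpha * sum, beta);
        std::printf("b[%d] = %f\n", i, b);
      }
      return 0;
    }

Each channel is divided by a power of the accumulated squared activities of its n neighboring channels, matching the expression in the describe() block.
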
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
deleted file mode 100644
index 71fdf4ca58..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_act.cc
- * \brief
- * \author Da Zheng
-*/
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../../operator_common.h"
-#include "../activation-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-
-#include <mkldnn.hpp>
-
-namespace mxnet {
-namespace op {
-
-bool SupportMKLDNNAct(const ActivationParam& param) {
-  // We only enable ReLU for now. It seems other activations have some precision
-  // problems.
-  return param.act_type == activation::kReLU;
-#if 0
-      || param.act_type == activation::kSigmoid
-      || param.act_type == activation::kSoftReLU;
-#endif
-}
-
-static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
-  switch (param.act_type) {
-    case activation::kReLU:
-      return mkldnn::algorithm::eltwise_relu;
-    case activation::kSigmoid:
-      return mkldnn::algorithm::eltwise_logistic;
-    case activation::kTanh:
-      return mkldnn::algorithm::eltwise_tanh;
-    case activation::kSoftReLU:
-      return mkldnn::algorithm::eltwise_soft_relu;
-    default:
-      LOG(FATAL) << "unknown activation type";
-      return mkldnn::algorithm::eltwise_relu;
-  }
-}
-
-typedef std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> mkldnn_act_pdesc_ptr;
-
-static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
-    const ActivationParam& param, bool is_train,
-    const mkldnn::memory &input_mem, int dtype) {
-  mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc();
-  mkldnn::memory::desc data_md = data_mpd.desc();
-  auto cpu_engine = data_mpd.get_engine();
-
-  auto alg = GetMKLDNNActAlgo(param);
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    DType alpha = 0;
-    mkldnn::eltwise_forward::desc desc = is_train
-        ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
-                                        alg, data_md, alpha)
-        : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
-                                        alg, data_md, alpha);
-    return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
-  });
-  LOG(INFO) << "Unsupported data type for MKLDNN activation";
-  mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc(
-      mkldnn::prop_kind::forward_training, alg, data_md, 0.0);
-  return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
-}
-
-typedef MKLDNNParamOpSign<ActivationParam> MKLDNNActSignature;
-
-class MKLDNNActForward {
-  std::shared_ptr<mkldnn::eltwise_forward> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> out;
-
- public:
-  const mkldnn::eltwise_forward::primitive_desc fwd_pd;
-
-  MKLDNNActForward(const ActivationParam& param, bool is_train,
-                   const NDArray &data, const mkldnn::memory &mem): fwd_pd(
-                       GetActFwdDescImpl(param, is_train, mem, data.dtype())) {
-  }
-
-  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
-    if (this->data == nullptr)
-      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              data.get_primitive_desc(), data.get_data_handle()));
-    else
-      this->data->set_data_handle(data.get_data_handle());
-
-    CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc());
-    if (this->out == nullptr)
-      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-    else
-      this->out->set_data_handle(output.get_data_handle());
-
-    if (this->fwd == nullptr) {
-      this->fwd = std::shared_ptr<mkldnn::eltwise_forward>(
-          new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                      *this->out));
-    }
-  }
-
-  const mkldnn::eltwise_forward &GetFwd() const {
-    return *fwd;
-  }
-};
-
-static MKLDNNActForward &GetActForward(const ActivationParam& param,
-                                       const OpContext &ctx, const NDArray &in_data,
-                                       const mkldnn::memory &in_mem) {
-  static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, MKLDNNOpHash> fwds;
-  MKLDNNActSignature key(param);
-  key.AddSign(ctx.is_train);
-  key.AddSign(param.act_type);
-  key.AddSign(in_data);
-
-  auto it = fwds.find(key);
-  if (it == fwds.end()) {
-    MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem);
-    auto ins_ret = fwds.insert(std::pair<MKLDNNActSignature, MKLDNNActForward>(
-            key, fwd));
-    CHECK(ins_ret.second);
-    it = ins_ret.first;
-  }
-  return it->second;
-}
-
-void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                             const NDArray &in_data, const OpReqType &req,
-                             const NDArray &out_data) {
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  auto input_mem = in_data.GetMKLDNNData();
-  MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem);
-  auto out_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData(
-      fwd.fwd_pd.dst_primitive_desc());
-  fwd.SetNewMem(*input_mem, *out_mem);
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  stream->RegisterPrim(fwd.GetFwd());
-  stream->Submit();
-}
-
-void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                              const NDArray &out_grad, const NDArray &in_data,
-                              const OpReqType &req, const NDArray &in_grad) {
-  if (req == kNullOp) {
-    return;
-  }
-
-  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
-  auto diff_dst_memory = out_grad.GetMKLDNNData();
-  auto input_mem = in_data.GetMKLDNNData();
-  // We need to make sure the two inputs to eltwise_backward have the same memory
-  // descriptor. Otherwise, performance will suffer.
-  if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
-    input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
-  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
-  mkldnn::memory::desc data_md = data_mpd.desc();
-  mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
-  auto cpu_engine = data_mpd.get_engine();
-
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  auto alg = GetMKLDNNActAlgo(param);
-  mkldnn_output_t diff_src_memory;
-
-  MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, {
-    DType alpha = 0;
-    mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
-                                          alg, data_md, alpha);
-    mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
-    mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
-    mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
-                                                      fw_pdesc);
-
-    diff_src_memory = CreateMKLDNNMem(in_grad,
-                                      bw_pdesc.diff_src_primitive_desc(), req);
-    stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem,
-                                                  *diff_dst_memory,
-                                                  *diff_src_memory.second));
-  });
-  CommitOutput(in_grad, diff_src_memory);
-  stream->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-
-#endif
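
The deleted mkldnn_act.cc builds an eltwise forward primitive once per thread and per operator signature and reuses it afterwards (see GetActForward above). Below is a minimal standalone sketch of that caching pattern; the string key and the placeholder Forward type stand in for MKLDNNActSignature and the MKLDNN primitive, are assumptions for illustration only, and the snippet is separate from the diff:

    #include <cstdio>
    #include <string>
    #include <unordered_map>

    // Sketch of the thread_local cache pattern: construct the (expensive) forward
    // object once per unique key, then reuse it on later calls from the same thread.
    struct Forward {
      explicit Forward(const std::string &key) { std::printf("building %s\n", key.c_str()); }
      void Run() const { std::puts("running cached forward"); }
    };

    static Forward &GetForward(const std::string &key) {
      static thread_local std::unordered_map<std::string, Forward> fwds;
      auto it = fwds.find(key);
      if (it == fwds.end())
        it = fwds.emplace(key, Forward(key)).first;
      return it->second;
    }

    int main() {
      GetForward("relu/f32/1x3x224x224").Run();  // builds once, then runs
      GetForward("relu/f32/1x3x224x224").Run();  // reuses the cached object
      return 0;
    }

The real code keys the cache on shapes, dtypes, layouts and is_train rather than a string; that is what MKLDNNParamOpSign in mkldnn_base-inl.h provides.
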
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
deleted file mode 100644
index 1c583e1f67..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*******************************************************************************
-* Copyright 2016-2017 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkldnn_base-inl.h
-* \brief
-* \author young.jin.kim@intel.com
-*         ashok.emani@intel.com
-*         deepthi.karkada@intel.com
-*         louis.feng@intel.com
-*         adam.d.straw@intel.com
-*         zhengda1936@gmail.com
-*
-*******************************************************************************/
-
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
-
-#if MXNET_USE_MKLDNN == 1
-#include <iterator>
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include <utility>
-#include <algorithm>
-#include <memory>
-#include "mkldnn.hpp"
-#include "mxnet/ndarray.h"
-#include "mxnet/resource.h"
-#include "mxnet/op_attr_types.h"
-using namespace mkldnn;
-namespace mxnet {
-extern bool EnableMkldnnWarnGenerated();
-// =====  CpuEngine =======================================
-// cpu_engine singleton
-class CpuEngine {
- public:
-  static CpuEngine *Get() {
-    // It's thread-safe in C++11.
-    static thread_local CpuEngine myInstance;
-    return &myInstance;
-  }
-  CpuEngine(CpuEngine const &) = delete;             // Copy construct
-  CpuEngine(CpuEngine &&) = delete;                  // Move construct
-  CpuEngine &operator=(CpuEngine const &) = delete;  // Copy assign
-  CpuEngine &operator=(CpuEngine &&) = delete;       // Move assign
-
-  mkldnn::engine &get_engine() { return _cpu_engine; }
-
- protected:
-  CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
-  ~CpuEngine() {}
-
- private:
-  mkldnn::engine _cpu_engine;
-};
-
-// type enumerator
-template <typename T>
-struct data_type_enum {};
-
-template <>
-struct data_type_enum<float> {
-  enum { type = mkldnn::memory::data_type::f32 };
-};
-
-template <>
-struct data_type_enum<int32_t> {
-  enum { type = mkldnn::memory::data_type::s32 };
-};
-
-template <>
-struct data_type_enum<int16_t> {
-  enum { type = mkldnn::memory::data_type::s16 };
-};
-
-template <>
-struct data_type_enum<int8_t> {
-  enum { type = mkldnn::memory::data_type::s8 };
-};
-
-template <>
-struct data_type_enum<uint8_t> {
-  enum { type = mkldnn::memory::data_type::u8 };
-};
-
-static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) {
-  int ndim = shape.ndim();
-  bool support = ndim == 1 || ndim == 2 || ndim == 4;
-  support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32
-                        || dtype == mshadow::kInt8 || dtype == mshadow::kUint8);
-  return support;
-}
-
-static inline bool SupportStorageMKLDNN(int stype) {
-  return stype == kDefaultStorage;
-}
-
-static inline bool SupportMKLDNN(int dtype, const TShape &shape) {
-  int ndim = shape.ndim();
-  return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
-}
-
-static inline bool SupportMKLDNN(const NDArray &input) {
-  return SupportMKLDNN(input.dtype(), input.shape())
-      && SupportStorageMKLDNN(input.storage_type());
-}
-
-static inline bool SupportMKLDNNConv(const NDArray &input) {
-  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
-}
-
-/*
- * This aligns an address to the given alignment.
- */
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
-
-namespace op {
-struct ActivationParam;
-bool SupportMKLDNNAct(const op::ActivationParam& param);
-}
-
-static int GetTypeSize(int dtype) {
-  int size = -1;
-  MSHADOW_TYPE_SWITCH(dtype, DType, {
-    size = sizeof(DType);
-  });
-  return size;
-}
-
-static inline size_t GetArraySize(const NDArray &arr) {
-  return arr.shape().Size() * GetTypeSize(arr.dtype());
-}
-
-static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
-  switch (dtype) {
-    case mshadow::kFloat32:
-      return mkldnn::memory::data_type::f32;
-    case mshadow::kInt32:
-      return mkldnn::memory::data_type::s32;
-    case mshadow::kInt8:
-      return mkldnn::memory::data_type::s8;
-    case mshadow::kUint8:
-      return mkldnn::memory::data_type::u8;
-    default:
-      LOG(FATAL) << "unknown type for MKLDNN";
-      return mkldnn::memory::data_type::data_undef;
-  }
-}
-
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
-  mkldnn::memory::dims dims(ndim);
-  for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
-  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
-                              mkldnn::memory::format::any};
-}
-
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
-  return GetMemDesc(arr, arr.shape().ndim());
-}
-
-inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
-                                                 int num_groups) {
-  if (num_groups == 1) {
-    return GetMemDesc(arr);
-  } else {
-    CHECK_EQ(arr.shape().ndim(), 4U);
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
-    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
-                                mkldnn::memory::format::any};
-  }
-}
-
-typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
-typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
-
-/*
- * This is to manage the temporary memory provided by MXNet for operators.
- * The temp memory is mainly used to keep reordered data. An operator may need
- * multiple pieces of memory for it, but MXNet can only provide a single piece.
- * This class helps split that single piece of temporary memory from MXNet so
- * it can store multiple reordered arrays.
- * The amount of temporary memory used in an operator depends on the layout of
- * the input arrays and on the operator. It's difficult to calculate manually,
- * so the class also estimates the required amount of memory automatically.
- */
-class TmpMemMgr {
-  // This points to the memory buffer where we can allocate temp memory.
-  char *curr_mem;
-  // The total size of the temp memory.
-  size_t mem_size;
-  // This contains the current available memory size.
-  size_t curr_size;
-  // This estimate the required temp memory size in an operator.
-  size_t est_size;
-  const size_t alignment = 4096;
-
- public:
-  static TmpMemMgr *Get() {
-    static thread_local TmpMemMgr mgr;
-    return &mgr;
-  }
-
-  TmpMemMgr() {
-    Reset();
-    est_size = 0;
-    mem_size = 0;
-  }
-
-  void Reset() {
-    curr_mem = nullptr;
-    curr_size = 0;
-    // We don't reset est_size and mem_size because est_size contains the
-    // estimated temp memory size from the last run and mem_size contains the
-    // memory size allocated in the last run.
-  }
-
-  void Init(const Resource &r) {
-    // If the last run estimated that more memory was needed than we allocated,
-    // provision the larger size this time.
-    mem_size = std::max(mem_size, est_size);
-    if (mem_size > 0) {
-      // Let's allocate some extra memory. If we don't use all of it, the OS
-      // won't physically allocate pages for the unused part anyway.
-      this->curr_size = mem_size * 2;
-      this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
-    }
-    // reset est_size, so we can start to estimate the temp memory size.
-    this->est_size = 0;
-  }
-
-  mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd);
-};
-
-class MKLDNNStream {
-  std::vector<mkldnn::primitive> net;
-  // Here we hold all memory related to the operators in the stream.
-  std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
-
- public:
-  static MKLDNNStream *Get() {
-    static thread_local MKLDNNStream stream;
-    return &stream;
-  }
-
-  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
-
-  void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
-    mem_holder.push_back(mem);
-  }
-
-  bool HasOps() const {
-    return !net.empty();
-  }
-
-  void Submit() {
-    if (!net.empty())
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    net.clear();
-    mem_holder.clear();
-    TmpMemMgr::Get()->Reset();
-  }
-};
-
-class MKLDNNOpSignature {
-  std::vector<int> eles;
-  uint64_t hash;
-
- public:
-  MKLDNNOpSignature() {
-    hash = 0;
-  }
-
-  explicit MKLDNNOpSignature(uint64_t hash) {
-    this->hash = hash;
-  }
-
-  /*
-   * We provide different methods to add a signature to an op.
-   * For operations such as convolution and fully connected, which determine
-   * the optimal data layout for the op, we only need the shape and data
-   * type to sign the op. For other operations, such as activation, which use
-   * whatever layout is in the input array, we have to use the shape, the data
-   * type and the layout to sign the op.
-   */
-
-  void AddSign(const mkldnn::memory &mem) {
-    auto desc = mem.get_primitive_desc().desc();
-    hash = hash * 2 + desc.data.format;
-    eles.push_back(desc.data.format);
-    hash = hash * 2 + desc.data.data_type;
-    eles.push_back(desc.data.data_type);
-    for (int i = 0; i < desc.data.ndims; i++) {
-      hash = hash * 2 + desc.data.dims[i];
-      eles.push_back(desc.data.dims[i]);
-    }
-  }
-
-  void AddSign(const std::vector<NDArray> &arrs) {
-    for (auto &arr : arrs) {
-      AddSign(arr);
-    }
-  }
-
-  void AddSign(const NDArray &arr) {
-    if (arr.IsMKLDNNData()) {
-      AddSign(*(arr.GetMKLDNNData()));
-    } else {
-      hash = hash * 2 + arr.dtype();
-      eles.push_back(arr.dtype());
-      AddSign(arr.shape());
-    }
-  }
-
-  void AddSign(const TShape &shape) {
-    for (size_t i = 0; i < shape.ndim(); i++) {
-      hash = hash * 2 + shape[i];
-      eles.push_back(shape[i]);
-    }
-  }
-
-  void AddSign(int val) {
-    hash = hash * 2 + val;
-    eles.push_back(val);
-  }
-
-  bool operator==(const MKLDNNOpSignature &sign) const {
-    if (hash != sign.hash)
-      return false;
-    if (eles.size() != sign.eles.size())
-      return false;
-    for (size_t i = 0; i < eles.size(); i++)
-      if (eles[i] != sign.eles[i])
-        return false;
-    return true;
-  }
-
-  uint64_t GetHash() const {
-    return hash;
-  }
-};
-
-struct MKLDNNOpHash {
-  size_t operator()(const MKLDNNOpSignature &sign) const {
-    return sign.GetHash();
-  }
-};
-
-template<typename ParamType>
-class MKLDNNParamOpSign: public MKLDNNOpSignature {
-  const ParamType param;
-
-  static size_t hash(const ParamType &param) {
-    std::hash<ParamType> fn;
-    return fn(param);
-  }
-
- public:
-  explicit MKLDNNParamOpSign(const ParamType &_param): MKLDNNOpSignature(
-      hash(_param)), param(_param) {
-  }
-
-  bool operator==(const MKLDNNParamOpSign<ParamType> &sign) const {
-    const MKLDNNOpSignature &this_upper = *this;
-    const MKLDNNOpSignature &other_upper = sign;
-    return this_upper == other_upper && param == sign.param;
-  }
-};
-
-enum OutDataOp {
-  Noop,
-  CopyBack,
-  AddBack,
-};
-
-typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
-
-/*
- * These two functions try to create MKLDNN memory in an NDArray based on `req'.
- * The difference is that the first function can create MKLDNN memory with
- * special layouts in an NDArray, while the second one can only create MKLDNN
- * memory with default layouts.
- * If these two functions are used, we have to call CommitOutput to write
- * the output back to the output NDArray.
- */
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
-                                const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
-                                       const mkldnn::memory::primitive_desc &desc,
-                                       OpReqType req);
-/* This function has to be used with one of the functions above. */
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
-
-static inline void InvalidateOutputs(const std::vector<NDArray> &arrs,
-                                     const std::vector<OpReqType> &reqs) {
-  for (size_t i = 0; i < arrs.size(); i++) {
-    if (reqs[i] == kWriteTo || reqs[i] == kNullOp) {
-      const_cast<NDArray &>(arrs[i]).InvalidateMKLDNNData();
-    }
-  }
-}
-
-const mkldnn::memory *GetWeights(const NDArray &arr,
-                                 const mkldnn::memory::primitive_desc &target_pd,
-                                 int num_groups);
-
-mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc);
-mkldnn_memory_format_t GetDefaultFormat(int num_dims);
-mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
-                                                mkldnn_memory_format_t format);
-
-void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
-                     const OpContext &ctx,
-                     const std::vector<NDArray> &inputs,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &outputs);
-
-/*
- * This class is used to check the correctness of MKLDNN operators.
- */
-class OpCheck {
-  std::vector<mxnet::NDArray> inputs;
-  std::vector<mxnet::NDArray> outputs;
-  bool backward;
-  size_t num_checks;
-
- public:
-  OpCheck(bool backward, size_t num_checks) {
-    this->backward = backward;
-    this->num_checks = num_checks;
-  }
-
-  void Init(const std::vector<mxnet::NDArray> &inputs_,
-          const std::vector<mxnet::NDArray> &outputs_);
-
-  void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
-           const mxnet::OpContext &ctx,
-           const std::vector<mxnet::NDArray> &inputs_,
-           const std::vector<mxnet::OpReqType> &req,
-           const std::vector<mxnet::NDArray> &outputs_);
-};
-
-#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs)  \
-    static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false);  \
-    OpCheck check(backward, num_checks);                            \
-    if (debug) check.Init(inputs, outputs);
-
-#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs)    \
-    if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs);
-
-}  // namespace mxnet
-#endif
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
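
The TmpMemMgr declared in this header sizes its workspace from the previous run: Alloc() accumulates an estimate, the next Init() provisions space based on that estimate, and requests that still don't fit fall back to a direct allocation. A minimal standalone sketch of that two-pass idea follows; std::vector buffers stand in for the Resource-provided host space, and the snippet is separate from the diff:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Sketch of the two-pass sizing behind TmpMemMgr: estimate during one run,
    // provision for the next, fall back when the provisioned space runs out.
    class TmpMemSketch {
      std::vector<char> buf;                    // stands in for the host workspace
      std::vector<std::vector<char>> overflow;  // direct-allocation fallbacks
      size_t offset = 0;                        // bytes handed out in this run
      size_t est_size = 0;                      // estimate accumulated this run
      size_t mem_size = 0;                      // size provisioned for this run
     public:
      void Init() {
        mem_size = std::max(mem_size, est_size);
        buf.assign(mem_size, 0);
        overflow.clear();
        offset = 0;
        est_size = 0;
      }
      void *Alloc(size_t size) {
        est_size += size;
        if (offset + size <= buf.size()) {
          void *p = buf.data() + offset;
          offset += size;
          return p;
        }
        overflow.emplace_back(size);  // fall back, mirroring the warning path in the real Alloc()
        return overflow.back().data();
      }
    };

    int main() {
      TmpMemSketch mgr;
      mgr.Init();      // first run: nothing provisioned yet, so Alloc() falls back
      mgr.Alloc(128);
      mgr.Init();      // second run: 128 bytes provisioned from the estimate
      void *p = mgr.Alloc(128);
      std::printf("second run served from the workspace: %d\n", p != nullptr);
      return 0;
    }

The real class additionally over-provisions (curr_size = mem_size * 2) and aligns every allocation to 4096 bytes; the sketch omits both.
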
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
deleted file mode 100644
index c34ca03a28..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#if MXNET_USE_MKLDNN == 1
-
-#include <atomic>
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
-
-namespace mxnet {
-
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
-  if (size > *space)
-    return nullptr;
-  intptr_t addr = reinterpret_cast<intptr_t>(mem);
-  // If the address has been aligned, don't do anything.
-  intptr_t last_chunk = addr % alignment;
-  if (last_chunk == 0)
-    return mem;
-  intptr_t padding = alignment - last_chunk;
-  // If the buffer doesn't have enough space, we should return null here.
-  if (padding + size > *space)
-    return nullptr;
-  addr += padding;
-  *space -= padding;
-  CHECK_EQ(addr % alignment, 0);
-  return reinterpret_cast<void *>(addr);
-}
-
-mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
-  // We need to include the size of the memory used for alignment.
-  this->est_size += pd.get_size() + alignment;
-  void *mem = AlignMem(this->curr_mem, pd.get_size(), alignment, &this->curr_size);
-  if (mem) {
-    // The memory is allocated from the temporary memory space in the
-    // operator. It'll only become invalid after we exit from the operator.
-    mkldnn_mem_ptr ret(new mkldnn::memory(pd, mem));
-    MKLDNNStream::Get()->RegisterMem(ret);
-    CHECK_EQ(mem, mem);
-    this->curr_size -= pd.get_size();
-    this->curr_mem = static_cast<char *>(mem) + pd.get_size();
-    return ret.get();
-  } else {
-    LOG(WARNING) << "Allocate " << pd.get_size()
-        << " bytes with malloc directly";
-    mkldnn_mem_ptr ret(new mkldnn::memory(pd));
-    MKLDNNStream::Get()->RegisterMem(ret);
-    return ret.get();
-  }
-}
-
-mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
-                                const mkldnn::memory::primitive_desc &desc,
-                                OpReqType req) {
-  if (kAddTo == req) {
-    auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::AddBack, tmp);
-  } else if (kWriteInplace == req) {
-    // MKLDNN ops may not support the case where the input and the output use
-    // the same memory. Let's use an extra copy to make sure it always works.
-    auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-  } else {
-    mkldnn::memory *mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
-    if (mem == nullptr) {
-      auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-    } else {
-      return mkldnn_output_t(OutDataOp::Noop, mem);
-    }
-  }
-}
-
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
-                                       const mkldnn::memory::primitive_desc &desc,
-                                       OpReqType req) {
-  if (kAddTo == req) {
-    auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::AddBack, tmp);
-  } else if (kWriteInplace == req) {
-    auto tmp = TmpMemMgr::Get()->Alloc(desc);
-    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-  } else {
-    auto _desc = desc;
-    auto def_format = GetDefaultFormat(_desc.desc());
-    mkldnn::memory *mem = nullptr;
-    if (def_format == _desc.desc().data.format) {
-      mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
-    }
-    if (mem == nullptr) {
-      auto tmp = TmpMemMgr::Get()->Alloc(desc);
-      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
-    } else {
-      return mkldnn_output_t(OutDataOp::Noop, mem);
-    }
-  }
-}
-
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
-  if (res.first == CopyBack) {
-    const_cast<NDArray &>(arr).CopyFrom(*res.second);
-  } else if (res.first == AddBack) {
-    auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc());
-    CHECK(mem != nullptr);
-    // We have to allocate new memory for the sum result.
-    auto sum_res = TmpMemMgr::Get()->Alloc(
-        res.second->get_primitive_desc());
-    op::Sum(*res.second, *mem, *sum_res);
-    const_cast<NDArray &>(arr).CopyFrom(*sum_res);
-  }
-}
-
-const mkldnn::memory *GetWeights(const NDArray &arr,
-                                 const mkldnn::memory::primitive_desc &target_pd,
-                                 int num_groups) {
-  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
-  // If the weight array already uses the target layout, simply return it
-  // directly.
-  if (mem)
-    return mem;
-
-  mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
-  auto engine = CpuEngine::Get()->get_engine();
-  if (arr.shape().ndim() == 2) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
-  } else if (arr.shape().ndim() == 4 && num_groups == 1) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]),
-          static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
-  } else if (arr.shape().ndim() == 4) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
-  } else {
-    LOG(FATAL) << "The weight array has an unsupported number of dimensions";
-    return nullptr;
-  }
-  if (mem == nullptr)
-    mem = arr.GetMKLDNNDataReorder(target_pd);
-  if (mem->get_primitive_desc() == target_pd) return mem;
-
-  auto ret = TmpMemMgr::Get()->Alloc(target_pd);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret));
-  return ret;
-}
-
-mkldnn_memory_format_t GetDefaultFormat(int num_dims) {
-  switch (num_dims) {
-    case 1: return mkldnn_x;
-    case 2: return mkldnn_nc;
-    case 4: return mkldnn_nchw;
-    case 5: return mkldnn_goihw;
-    default:
-      LOG(FATAL) << "Unsupported MKLDNN dimensions: " << num_dims;
-      return mkldnn_format_undef;
-  }
-}
-
-mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) {
-  if (desc.data.ndims == 1) {
-    return desc.data.format;
-  } else if (desc.data.ndims == 2) {
-    if (desc.data.format == mkldnn_io)
-      return mkldnn_oi;
-    else
-      return desc.data.format;
-  } else if (desc.data.ndims == 4) {
-    switch (desc.data.format) {
-      case mkldnn_nchw:
-      case mkldnn_nhwc:
-      case mkldnn_chwn:
-      case mkldnn_nChw8c:
-      case mkldnn_nChw16c:
-        return mkldnn_nchw;
-      case mkldnn_oihw:
-      case mkldnn_ihwo:
-      case mkldnn_hwio:
-      case mkldnn_OIhw8i8o:
-      case mkldnn_OIhw16i16o:
-      case mkldnn_OIhw8i16o2i:
-      case mkldnn_OIhw8o16i2o:
-      case mkldnn_OIhw8o8i:
-      case mkldnn_OIhw16o16i:
-      case mkldnn_IOhw16o16i:
-      case mkldnn_Oihw8o:
-      case mkldnn_Oihw16o:
-      case mkldnn_Ohwi8o:
-      case mkldnn_Ohwi16o:
-      case mkldnn_OhIw16o4i:
-        return mkldnn_oihw;
-      default:
-        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
-        return mkldnn_format_undef;
-    }
-  } else if (desc.data.ndims == 5) {
-    switch (desc.data.format) {
-      case mkldnn_goihw:
-      case mkldnn_gOIhw8i8o:
-      case mkldnn_gOIhw16i16o:
-      case mkldnn_gOIhw8i16o2i:
-      case mkldnn_gOIhw8o16i2o:
-      case mkldnn_gOIhw8o8i:
-      case mkldnn_gOIhw16o16i:
-      case mkldnn_gIOhw16o16i:
-      case mkldnn_gOihw8o:
-      case mkldnn_gOihw16o:
-      case mkldnn_gOhwi8o:
-      case mkldnn_gOhwi16o:
-      case mkldnn_gOhIw16o4i:
-        return mkldnn_goihw;
-      default:
-        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
-        return mkldnn_format_undef;
-    }
-  } else {
-    LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims;
-    return mkldnn_format_undef;
-  }
-}
-
-mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
-                                                mkldnn_memory_format_t format) {
-  mkldnn::memory::dims dims(pd.desc().data.ndims);
-  for (size_t i = 0; i < dims.size(); i++)
-    dims[i] = pd.desc().data.dims[i];
-  mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(format);
-  mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
-      pd.desc().data.data_type);
-  mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-  return mkldnn::memory::primitive_desc(data_md, pd.get_engine());
-}
-
-void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
-                     const OpContext &ctx,
-                     const std::vector<NDArray> &inputs,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &outputs) {
-  std::vector<TBlob> in_blobs(inputs.size());
-  for (size_t i = 0; i < in_blobs.size(); i++) {
-      in_blobs[i] = inputs[i].data();
-  }
-  std::vector<TBlob> out_blobs(outputs.size());
-  for (size_t i = 0; i < out_blobs.size(); i++) {
-    if (req[i] == kWriteTo)
-      const_cast<NDArray &>(outputs[i]).InvalidateMKLDNNData();
-    CHECK(outputs[i].IsDefaultData());
-    out_blobs[i] = outputs[i].data();
-  }
-  fn(attrs, ctx, in_blobs, req, out_blobs);
-}
-
-template<typename DType>
-void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) {
-  DType *data1 = reinterpret_cast<DType *>(arr1.data().dptr_);
-  DType *data2 = reinterpret_cast<DType *>(arr2.data().dptr_);
-  for (size_t i = 0; i < arr1.shape().Size(); i++)
-    std::cout << data1[i] - data2[i] << ", ";
-  std::cout << std::endl;
-}
-
-template<typename DType>
-static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
-                         DType rtol, DType atol) {
-  if (arr1.shape().Size() != arr2.shape().Size())
-    return false;
-
-  // This function should be used outside an MKLDNN operator.
-  // There shouldn't be any operators in the stream.
-  CHECK(!MKLDNNStream::Get()->HasOps());
-  // We need to reorder data in the arrays to the default layout.
-  // But we shouldn't reorder data in the original array.
-  NDArray buf1, buf2;
-  if (arr1.IsMKLDNNData()) {
-    buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
-    auto mem = arr1.GetMKLDNNData();
-    buf1.CopyFrom(*mem);
-  }
-  if (arr2.IsMKLDNNData()) {
-    buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
-    auto mem = arr2.GetMKLDNNData();
-    buf2.CopyFrom(*mem);
-  }
-  MKLDNNStream::Get()->Submit();
-
-  DType *data1 = reinterpret_cast<DType *>(
-      arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_);
-  DType *data2 = reinterpret_cast<DType *>(
-      arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
-  std::atomic<bool> success(true);
-#pragma omp parallel for
-  for (size_t i = 0; i < arr1.shape().Size(); i++) {
-    if (std::abs(data1[i] - data2[i]) > atol + rtol * std::abs(data2[i]))
-      success.store(false);
-  }
-  return success.load();
-}
-
-void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
-                   const std::vector<mxnet::NDArray> &outputs_) {
-  auto ctx = inputs_[0].ctx();
-  CHECK(!MKLDNNStream::Get()->HasOps());
-  for (size_t i = 0; i < inputs_.size(); i++) {
-    inputs.emplace_back(inputs_[i].shape(), ctx,
-                        false, inputs_[i].dtype());
-    auto mem = inputs_[i].GetMKLDNNData();
-    inputs[i].CopyFrom(*mem);
-  }
-  for (size_t i = 0; i < outputs_.size(); i++) {
-    outputs.emplace_back(outputs_[i].shape(), ctx,
-                         false, outputs_[i].dtype());
-    if (backward) {
-      auto mem = outputs_[i].GetMKLDNNData();
-      outputs[i].CopyFrom(*mem);
-    }
-  }
-  MKLDNNStream::Get()->Submit();
-}
-
-void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
-                  const mxnet::OpContext &ctx,
-                  const std::vector<mxnet::NDArray> &inputs_,
-                  const std::vector<mxnet::OpReqType> &req,
-                  const std::vector<mxnet::NDArray> &outputs_) {
-  std::vector<mxnet::TBlob> in_blobs(inputs.size());
-  for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
-  std::vector<mxnet::TBlob> out_blobs(outputs.size());
-  for (size_t i = 0; i < out_blobs.size(); i++)
-    out_blobs[i] = outputs[i].data();
-  fn(attrs, ctx, in_blobs, req, out_blobs);
-
-  LOG(INFO) << "test " << attrs.op->name;
-  size_t num = std::min(outputs.size(), outputs_.size());
-  num = std::min(num_checks, num);
-  for (size_t i = 0; i < num; i++) {
-    // We don't need to compare outputs that the op isn't required to write.
-    if (req[i] == kNullOp)
-      continue;
-    MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, {
-      bool similar = SimilarArray<DType>(outputs[i], outputs_[i], 1e-3, 1e-4);
-      if (!similar) {
-        LOG(ERROR) << attrs.op->name << " fails";
-        print_diff<DType>(outputs[i], outputs_[i]);
-      }
-      CHECK(similar);
-    });
-  }
-}
-
-}  // namespace mxnet
-
-#endif
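
When MXNET_MKLDNN_DEBUG is set, the OpCheck path above re-runs the reference CPU implementation and compares it against the MKLDNN result with a mixed relative/absolute tolerance. A minimal standalone sketch of that element-wise comparison follows (the vectors and tolerances are illustrative only, and the snippet is separate from the diff):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Sketch of the SimilarArray criterion: two results agree when
    //   |a[i] - b[i]| <= atol + rtol * |b[i]|   for every element.
    static bool Similar(const std::vector<float> &a, const std::vector<float> &b,
                        float rtol, float atol) {
      if (a.size() != b.size()) return false;
      for (size_t i = 0; i < a.size(); ++i)
        if (std::abs(a[i] - b[i]) > atol + rtol * std::abs(b[i])) return false;
      return true;
    }

    int main() {
      const std::vector<float> ref = {1.0f, 2.0f, 3.0f};
      const std::vector<float> out = {1.0005f, 2.001f, 3.0f};
      std::printf("similar: %d\n", Similar(out, ref, 1e-3f, 1e-4f));
      return 0;
    }

The deleted implementation parallelises the loop with OpenMP and reorders MKLDNN-layout arrays to the default layout before comparing; the sketch omits both steps.
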
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
deleted file mode 100644
index 19a98da6af..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ /dev/null
@@ -1,431 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_batch_norm-inl.h
- * \brief
- * \author Tao Lv
-*/
-
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
-
-#if MXNET_USE_MKLDNN == 1
-#include <vector>
-#include <utility>
-#include <mkldnn.hpp>
-#include "../batch_norm-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$)   ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-namespace mxnet {
-namespace op {
-
-typedef mkldnn::batch_normalization_forward::primitive_desc     t_bn_f_pdesc;
-typedef mkldnn::batch_normalization_forward::desc               t_bn_f_desc;
-typedef mkldnn::batch_normalization_backward::primitive_desc    t_bn_b_pdesc;
-typedef mkldnn::batch_normalization_backward::desc              t_bn_b_desc;
-
-using mkldnn::use_global_stats;
-using mkldnn::use_scale_shift;
-using mkldnn::forward_training;
-using mkldnn::forward_inference;
-
-inline static unsigned _GetFlags(const std::vector<NDArray> &in_data,
-                                 const std::vector<NDArray> &aux_states,
-                                 const BatchNormParam &param, bool is_train) {
-  unsigned flags = 0U;
-  if (in_data.size() == 3U) {
-    flags |= use_scale_shift;
-  }
-
-  // aux_states[0]: inMean
-  // aux_states[1]: inVariance
-  if (aux_states.size() == 2U && !is_train) {
-    flags |= use_global_stats;
-  }
-  return flags;
-}
-
-template <typename DType>
-inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
-                                   bool is_train,
-                                   DType eps,
-                                   unsigned flags) {
-  auto data_mpd   = data_mem.get_primitive_desc();
-  auto data_md    = data_mpd.desc();
-  auto engine     = CpuEngine::Get()->get_engine();
-
-  if (is_train) {
-    t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags);
-    return t_bn_f_pdesc(bnFwd_desc, engine);
-  } else {
-    t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags);
-    return t_bn_f_pdesc(bnFwd_desc, engine);
-  }
-}
-
-template <typename DType>
-inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
-                                   const mkldnn::memory &diff_mem,
-                                   DType eps,
-                                   unsigned flags) {
-  auto data_mpd   = data_mem.get_primitive_desc();
-  auto data_md    = data_mpd.desc();
-  auto diff_mpd   = diff_mem.get_primitive_desc();
-  auto diff_md    = diff_mpd.desc();
-  auto engine     = CpuEngine::Get()->get_engine();
-
-  t_bn_b_desc  bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
-  return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
-}
-
-typedef MKLDNNParamOpSign<BatchNormParam> MKLDNNBNSignature;
-
-class MKLDNNBNForward {
-  std::shared_ptr<const mkldnn::memory> data_m;
-  std::shared_ptr<const mkldnn::memory> weight_m;
-  std::shared_ptr<const mkldnn::memory> out_m;
-  std::shared_ptr<const mkldnn::memory> mean_m;
-  std::shared_ptr<const mkldnn::memory> var_m;
-  std::shared_ptr<mkldnn::batch_normalization_forward> fwd;
-  bool is_train;
-  t_bn_f_pdesc pd;
-
- public:
-  MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train): pd(_pd) {
-    weight_m.reset(new mkldnn::memory(pd.weights_primitive_desc()));
-    this->is_train = is_train;
-  }
-
-  const mkldnn::memory &GetWeight() const {
-    return *weight_m;
-  }
-
-  const t_bn_f_pdesc &GetPd() const {
-    return pd;
-  }
-
-  const mkldnn::memory &GetMean() const {
-    return *mean_m;
-  }
-
-  const mkldnn::memory &GetVar() const {
-    return *var_m;
-  }
-
-  void SetDataHandle(const NDArray &data, const NDArray &mean,
-                     const NDArray &var, const mkldnn::memory &out) {
-    auto _data = data.GetMKLDNNData();
-    if (data_m) {
-      data_m->set_data_handle(_data->get_data_handle());
-    } else {
-      data_m.reset(new mkldnn::memory(_data->get_primitive_desc(),
-                                      _data->get_data_handle()));
-    }
-    if (out_m) {
-      out_m->set_data_handle(out.get_data_handle());
-    } else {
-      out_m.reset(new mkldnn::memory(out.get_primitive_desc(),
-                                     out.get_data_handle()));
-    }
-    auto mean_ptr = mean.data().dptr_;
-    if (mean_m) {
-      mean_m->set_data_handle(mean_ptr);
-    } else {
-      mean_m.reset(new mkldnn::memory(pd.mean_primitive_desc(),
-                                      mean_ptr));
-    }
-    auto var_ptr = var.data().dptr_;
-    if (var_m) {
-      var_m->set_data_handle(var_ptr);
-    } else {
-      var_m.reset(new mkldnn::memory(pd.variance_primitive_desc(),
-                                     var_ptr));
-    }
-
-    if (fwd == nullptr) {
-      if (!is_train)
-        fwd.reset(new mkldnn::batch_normalization_forward(
-                pd, *data_m, mkldnn::primitive::at(*mean_m),
-                mkldnn::primitive::at(*var_m), *weight_m, *out_m));
-      else
-        fwd.reset(new mkldnn::batch_normalization_forward(
-                pd, mkldnn::primitive::at(*data_m),
-                mkldnn::primitive::at(*weight_m), *out_m,
-                *mean_m, *var_m));
-    }
-  }
-
-  const mkldnn::batch_normalization_forward &GetFwd() const {
-    return *fwd;
-  }
-};
-
-template<typename DType>
-static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
-                                     const OpContext &ctx, const NDArray &in_data,
-                                     unsigned flags) {
-  static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, MKLDNNOpHash> fwds;
-  MKLDNNBNSignature key(param);
-  key.AddSign(ctx.is_train);
-  key.AddSign(in_data);
-
-  auto it = fwds.find(key);
-  if (it == fwds.end()) {
-    auto fwd_pd = _GetFwd(*in_data.GetMKLDNNData(), ctx.is_train,
-                          (DType) param.eps, flags);
-    MKLDNNBNForward fwd(fwd_pd, ctx.is_train);
-    auto ins_ret = fwds.insert(std::pair<MKLDNNBNSignature, MKLDNNBNForward>(
-            key, fwd));
-    CHECK(ins_ret.second);
-    it = ins_ret.first;
-  }
-  return it->second;
-}
-
-template <typename DType>
-void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam &param,
-                            const std::vector<NDArray>   &in_data,
-                            const std::vector<OpReqType> &req,
-                            const std::vector<NDArray>   &out_data,
-                            const std::vector<NDArray>   &aux_states) {
-  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
-  unsigned flags      = _GetFlags(in_data, aux_states, param, ctx.is_train);
-  const NDArray &data = in_data[batchnorm::kData];
-
-  auto &fwd = GetBNForward<DType>(param, ctx, data, flags);
-  const NDArray &out  = out_data[batchnorm::kOut];
-
-  // for output memory
-  auto out_mem = const_cast<NDArray &>(out).CreateMKLDNNData(fwd.GetPd().dst_primitive_desc());
-
-  // mxnet will always use scale shift.
-  // But if fix_gamma is true, then all scale elements will be set to 1.0f
-  if (flags & use_scale_shift) {
-    const NDArray &gamma    = in_data[batchnorm::kGamma];
-    const NDArray &beta     = in_data[batchnorm::kBeta];
-    CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
-    CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
-
-    const mkldnn::memory &weight_mem = fwd.GetWeight();
-    DType* weight_buf = reinterpret_cast<DType *>(weight_mem.get_data_handle());
-
-    nnvm::dim_t channels_ = data.shape()[1];
-    CHECK(weight_mem.get_primitive_desc().get_size() == channels_ * sizeof(DType) * 2);
-    DType* weight_ptr = gamma.data().dptr<DType>();
-    DType* bias_ptr = beta.data().dptr<DType>();
-    if (!param.fix_gamma) {
-#pragma omp parallel for simd
-      for (int i = 0; i < channels_; i++) {
-        weight_buf[i] = weight_ptr[i];
-        weight_buf[channels_ + i] = bias_ptr[i];  // bias
-      }
-    } else if (IsBNWriting(req[batchnorm::kGamma])) {
-#pragma omp parallel for simd
-      for (int i = 0; i < channels_; i++) {
-        weight_buf[i] = (DType)1.0f;
-        weight_ptr[i] = (DType)1.0f;
-        weight_buf[channels_ + i] = bias_ptr[i];  // bias
-      }
-    } else {
-#pragma omp parallel for simd
-      for (int i = 0; i < channels_; i++) {
-        weight_buf[i] = (DType)1.0f;
-        weight_buf[channels_ + i] = bias_ptr[i];  // bias
-      }
-    }
-
-    if (!ctx.is_train) {
-      DType* omean    = out_data[batchnorm::kMean].data().dptr<DType>();
-      DType* ovar     = out_data[batchnorm::kVar].data().dptr<DType>();
-      DType* inmean   = aux_states[batchnorm::kMovingMean].data().dptr<DType>();
-      DType* invar    = aux_states[batchnorm::kMovingVar].data().dptr<DType>();
-      // to align with the original implementation: batch_norm.cc: L164
-#pragma omp parallel for simd
-      for (int i = 0; i < channels_; i++) {
-        omean[i] = inmean[i];
-        ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
-      }
-
-      fwd.SetDataHandle(data, aux_states[batchnorm::kMovingMean],
-                        aux_states[batchnorm::kMovingVar],
-                        *out_mem);
-      MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
-      MKLDNNStream::Get()->Submit();
-    } else {  // training
-      const NDArray &outMean  = out_data[batchnorm::kMean];
-      const NDArray &outVar   = out_data[batchnorm::kVar];
-      DType* omean    = outMean.data().dptr<DType>();
-      DType* ovar     = outVar.data().dptr<DType>();
-
-      fwd.SetDataHandle(data, outMean, outVar, *out_mem);
-      MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
-      MKLDNNStream::Get()->Submit();
-      DType* mean_mem_ptr = reinterpret_cast<DType*>(fwd.GetMean().get_data_handle());
-      DType* var_mem_ptr  = reinterpret_cast<DType*>(fwd.GetVar().get_data_handle());
-#pragma omp parallel for simd
-      for (int i = 0; i < channels_; i++) {
-        omean[i] = mean_mem_ptr[i];
-        ovar[i]  = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps);
-      }
-    }
-  } else {  // no input gamma and beta
-      LOG(FATAL) << "MKLDNN batch normalization: should not reach here ...";
-  }
-}
-
-template <typename DType>
-void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam &param,
-                             const std::vector<NDArray>    &out_grad,
-                             const std::vector<NDArray>    &in_data,
-                             const std::vector<NDArray>    &out_data,
-                             const std::vector<OpReqType>  &req,
-                             const std::vector<NDArray>    &in_grad,
-                             const std::vector<NDArray>    &aux_states) {
-  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
-  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
-  CHECK_EQ(in_data.size(), 3U);
-  CHECK_EQ(out_data.size(), 3U);
-  CHECK_EQ(in_grad.size(), 3U);
-  unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train);
-
-  const NDArray &data         = in_data[batchnorm::kData];
-  const NDArray &diff         = out_grad[batchnorm::kOut];
-  const NDArray &gradIn       = in_grad[batchnorm::kData];
-  const NDArray &moving_mean  = aux_states[batchnorm::kMovingMean];
-  const NDArray &moving_var   = aux_states[batchnorm::kMovingVar];
-  const NDArray &out_mean     = out_data[batchnorm::kMean];
-  const NDArray &out_var      = out_data[batchnorm::kVar];
-
-  CHECK(out_mean.IsDefaultData());
-  CHECK(out_var.IsDefaultData());
-  CHECK(moving_mean.IsDefaultData());
-  CHECK(moving_var.IsDefaultData());
-
-  auto data_mem  = data.GetMKLDNNData();
-  auto diff_mem  = diff.GetMKLDNNData();
-  // MKLDNN batchnorm should run on special layouts. If either array isn't in
-  // such a layout, we should reorder it.
-  if (data.IsDefaultData())
-    data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc());
-  else if (diff.IsDefaultData())
-    diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc());
-  auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags);
-  auto gradi_mem = const_cast<NDArray &>(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc());
-
-  if (flags & use_scale_shift) {
-    const NDArray &gamma    = in_data[batchnorm::kGamma];
-    const NDArray &beta     = in_data[batchnorm::kBeta];
-    // TODO(tao): how to reuse this memory?
-    std::shared_ptr<const mkldnn::memory> weight_mem(
-                    new mkldnn::memory(bwd_pd.weights_primitive_desc()));
-
-    DType* weight_buf = reinterpret_cast<DType *>(weight_mem->get_data_handle());
-    nnvm::dim_t channels_ = data.shape()[1];
-    for (int i = 0; i < channels_; i++) {
-      if (!param.fix_gamma)
-        weight_buf[i] = (gamma.data().dptr<DType>())[i];   // weight
-      else
-        weight_buf[i] = (DType)1.0f;
-    }
-
-    for (int i = 0; i < channels_; i++) {
-      weight_buf[channels_ + i] = (beta.data().dptr<DType>())[i];  // bias
-    }
-
-    std::shared_ptr<const mkldnn::memory> gradw_mem(
-                    new mkldnn::memory(bwd_pd.diff_weights_primitive_desc()));
-    // training and not using global stats: use the batch mean/variance from
-    // the forward pass and update the moving statistics
-    if (ctx.is_train && !param.use_global_stats) {
-      DType* moving_mean_ptr  = reinterpret_cast<DType *>(moving_mean.data().dptr<DType>());
-      DType* moving_var_ptr   = reinterpret_cast<DType *>(moving_var.data().dptr<DType>());
-      DType* out_mean_ptr     = reinterpret_cast<DType *>(out_mean.data().dptr<DType>());
-      DType* out_var_ptr      = reinterpret_cast<DType *>(out_var.data().dptr<DType>());
-      mkldnn::memory var_mem(bwd_pd.variance_primitive_desc());
-      DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
-
-      DType minus_mom = (1.0f - param.momentum);
-      for (int i = 0; i < channels_; i++) {
-        moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
-                             out_mean_ptr[i] * minus_mom;
-        float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
-        tmp_var_ptr[i] = variance;
-        moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
-                            variance * minus_mom;
-      }
-
-      std::shared_ptr<const mkldnn::memory> out_mean_mem(
-                      new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr));
-      std::shared_ptr<const mkldnn::memory> out_var_mem(
-                      new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr));
-
-      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
-                                                         *data_mem,
-                                                         mkldnn::primitive::at(*out_mean_mem),
-                                                         mkldnn::primitive::at(var_mem),
-                                                         *diff_mem,
-                                                         *weight_mem,
-                                                         *gradi_mem,
-                                                         *gradw_mem);
-
-      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
-      MKLDNNStream::Get()->Submit();
-    } else {
-      std::shared_ptr<const mkldnn::memory> imean_mem(
-                      new mkldnn::memory(bwd_pd.mean_primitive_desc(),
-                      moving_mean.data().dptr<DType>()));
-      std::shared_ptr<const mkldnn::memory> ivar_mem(
-                      new mkldnn::memory(bwd_pd.variance_primitive_desc(),
-                      moving_var.data().dptr<DType>()));
-      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
-                                                         *data_mem,
-                                                         mkldnn::primitive::at(*imean_mem),
-                                                         mkldnn::primitive::at(*ivar_mem),
-                                                         *diff_mem,
-                                                         *weight_mem,
-                                                         *gradi_mem,
-                                                         *gradw_mem);
-
-      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
-      MKLDNNStream::Get()->Submit();
-    }
-
-    // copy data from gradw_mem to in_grad[1] and in_grad[2]
-    DType* gw_buf = reinterpret_cast<DType *>(gradw_mem->get_data_handle());
-    for (int i = 0; i < channels_; i++) {
-      if (!param.fix_gamma)
-        (in_grad[1].data().dptr<DType>())[i] = gw_buf[i];
-      else
-        (in_grad[1].data().dptr<DType>())[i] = 0.0f;
-    }
-
-    for (int i = 0; i < channels_; i++) {
-      (in_grad[2].data().dptr<DType>())[i] = gw_buf[i + channels_];
-    }
-  } else {
-    LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ...";
-  }
-}
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
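The loop in the deleted backward pass that blends the batch statistics into the moving statistics is easier to follow in isolation. Below is a minimal standalone C++ sketch of the same arithmetic, assuming the usual MXNet macro definitions (invstd = 1/sqrt(var + eps)); the function and buffer names are illustrative, not from the original file.

    #include <vector>

    // Blend the batch statistics into the running (moving) statistics. The
    // batch variance is first recovered from the stored inverse standard
    // deviation, mirroring INVSTD_TO_VARIANCE in the deleted code.
    void UpdateRunningStats(std::vector<float>* running_mean,
                            std::vector<float>* running_var,
                            const std::vector<float>& batch_mean,
                            const std::vector<float>& batch_inv_std,
                            float momentum, float eps) {
      const float minus_mom = 1.0f - momentum;
      for (size_t c = 0; c < running_mean->size(); ++c) {
        // var = 1 / invstd^2 - eps
        const float variance =
            1.0f / (batch_inv_std[c] * batch_inv_std[c]) - eps;
        (*running_mean)[c] = (*running_mean)[c] * momentum + batch_mean[c] * minus_mom;
        (*running_var)[c]  = (*running_var)[c]  * momentum + variance * minus_mom;
      }
    }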
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
deleted file mode 100644
index d3e6e77502..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_concat.cc
- * \brief
- * \author Wenting Jiang
-*/
-#include "../concat-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                         const std::vector<NDArray> &in_data,
-                         const std::vector<OpReqType> &req,
-                         const std::vector<NDArray> &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  int num_in_data = param.num_args;
-  int concat_dim = param.dim;
-  std::vector<mkldnn::memory::primitive_desc> data_md;
-  std::vector<mkldnn::primitive::at> data_mem;
-  for (int i = 0; i < num_in_data; i++) {
-    auto tmp_mem = in_data[i].GetMKLDNNData();
-    auto tmp_pd = tmp_mem->get_primitive_desc();
-    data_md.push_back(tmp_pd);
-    data_mem.push_back(*tmp_mem);
-  }
-  mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md);
-  auto engine = CpuEngine::Get()->get_engine();
-  auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
-      fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second));
-  CommitOutput(out_data[concat_enum::kOut], out_mem);
-  MKLDNNStream::Get()->Submit();
-}
-
-void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                          const std::vector<NDArray>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<NDArray>& outputs) {
-  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
-  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
-  int num_in_data = param.num_args;
-  int axis_ = param.dim;
-  auto engine = CpuEngine::Get()->get_engine();
-  auto gz_mem = inputs[0].GetMKLDNNData();
-  mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc();
-  /* init the offset */
-  mkldnn::memory::dims offsets = {0, 0, 0, 0};
-  for (int i = 0; i < num_in_data; i++) {
-    mkldnn::memory::dims diff_src_tz
-        = {static_cast<int>(inputs[i+1].shape()[0]),
-          static_cast<int>(inputs[i+1].shape()[1]),
-          static_cast<int>(inputs[i+1].shape()[2]),
-          static_cast<int>(inputs[i+1].shape()[3])};
-    auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc();
-    auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]);
-    // create view from gy to gxs[i]
-    std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
-    view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets));
-    // create reorder primitive from gy to gxs[i]
-    mkldnn::reorder::primitive_desc reorder_pd(
-        view_pd.get()->dst_primitive_desc(), diff_src_mpd);
-    offsets[axis_] += diff_src_tz[axis_];
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(
-            reorder_pd, *gz_mem, *gradi_mem_.second));
-    CommitOutput(outputs[i], gradi_mem_);
-  }
-  MKLDNNStream::Get()->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-#endif
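The backward pass of the deleted concat implementation slices the output gradient back into per-input gradients by advancing an offset along the concat axis before each view/reorder. A minimal sketch of that offset bookkeeping, without the MKLDNN primitives (ConcatOffsets and the shapes are illustrative, not from the original code):

    #include <array>
    #include <cstdio>
    #include <vector>

    // For each concat input, compute the offset along the concat axis at which
    // its slice of the concatenated gradient starts, mirroring the
    // "offsets[axis_] += diff_src_tz[axis_]" bookkeeping above.
    std::vector<int> ConcatOffsets(const std::vector<std::array<int, 4>>& shapes,
                                   int axis) {
      std::vector<int> offsets;
      int running = 0;
      for (const auto& shape : shapes) {
        offsets.push_back(running);
        running += shape[axis];
      }
      return offsets;
    }

    int main() {
      // Two inputs concatenated along the channel axis (axis 1).
      std::vector<std::array<int, 4>> shapes = {{2, 3, 8, 8}, {2, 5, 8, 8}};
      for (int off : ConcatOffsets(shapes, 1))
        std::printf("slice starts at channel %d\n", off);  // prints 0, then 3
      return 0;
    }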
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
deleted file mode 100644
index b94850aa62..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_convolution.cc
- * \brief
- * \author Da Zheng
-*/
-
-#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
-    const ConvolutionParam& param, bool is_train, const NDArray &data,
-    const NDArray &weights, const NDArray *bias, const NDArray &output) {
-  auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
-  if (param.dilate.ndim() == 0 && bias == nullptr) {
-    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_forward::primitive_desc(desc, engine);
-  } else if (param.dilate.ndim() == 0) {
-    auto bias_md = GetMemDesc(*bias);
-    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, bias_md, out_md, strides, padding, padding,
-        mkldnn::padding_kind::zero);
-    return mkldnn::convolution_forward::primitive_desc(desc, engine);
-  } else {
-    mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
-    if (bias == nullptr) {
-      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
-          data_md, weight_md, out_md, strides, dilates, padding, padding,
-          mkldnn::padding_kind::zero);
-      return mkldnn::convolution_forward::primitive_desc(desc, engine);
-    } else {
-      auto bias_md = GetMemDesc(*bias);
-      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
-                                             data_md, weight_md, bias_md, out_md, strides,
-                                             dilates, padding, padding,
-                                             mkldnn::padding_kind::zero);
-      return mkldnn::convolution_forward::primitive_desc(desc, engine);
-    }
-  }
-}
-
-static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
-    const ConvolutionParam& param, const NDArray &data, const NDArray &weights,
-    const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
-  if (param.dilate.ndim() == 0) {
-    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
-  } else {
-    mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
-    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, out_md, strides, dilates, padding, padding,
-        mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
-  }
-}
-
-static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
-    const ConvolutionParam& param, const NDArray &data,
-    const NDArray &weights, const NDArray *bias, const NDArray &output,
-    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  }
-  if (param.dilate.ndim() == 0 && bias == nullptr) {
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-  } else if (param.dilate.ndim() == 0) {
-    auto bias_md = GetMemDesc(*bias);
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, bias_md, out_md, strides, padding, padding,
-        mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-  } else {
-    mkldnn::memory::dims dilates{0, 0};
-    if (param.dilate.ndim() == 2) {
-      dilates[0] = param.dilate[0] - 1;
-      dilates[1] = param.dilate[1] - 1;
-    }
-    if (bias == nullptr) {
-      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-          data_md, weight_md, out_md, strides, dilates, padding, padding,
-          mkldnn::padding_kind::zero);
-      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-    } else {
-      auto bias_md = GetMemDesc(*bias);
-      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-                                                      data_md, weight_md, bias_md, out_md,
-                                                      strides, dilates, padding, padding,
-                                                      mkldnn::padding_kind::zero);
-      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-    }
-  }
-}
-
-class MKLDNNConvForward {
-  std::shared_ptr<mkldnn::convolution_forward> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> weight;
-  std::shared_ptr<mkldnn::memory> bias;
-  std::shared_ptr<mkldnn::memory> out;
-
- public:
-  mkldnn::convolution_forward::primitive_desc fwd_pd;
-
-  MKLDNNConvForward(const ConvolutionParam& param, bool is_train,
-                    const NDArray &data, const NDArray &weights,
-                    const NDArray *bias, const NDArray &output): fwd_pd(
-                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
-  }
-
-  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
-                 const mkldnn::memory *bias, const mkldnn::memory &output) {
-    if (this->data == nullptr)
-      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.src_primitive_desc(), data.get_data_handle()));
-    else
-      this->data->set_data_handle(data.get_data_handle());
-
-    if (this->weight == nullptr)
-      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
-    else
-      this->weight->set_data_handle(weight.get_data_handle());
-
-    if (this->out == nullptr)
-      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-    else
-      this->out->set_data_handle(output.get_data_handle());
-
-    if (bias != nullptr) {
-      if (this->bias == nullptr)
-        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-                fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
-      else
-        this->bias->set_data_handle(bias->get_data_handle());
-      if (this->fwd == nullptr)
-        this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-            new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                            mkldnn::primitive::at(*this->weight),
-                                            mkldnn::primitive::at(*this->bias),
-                                            *this->out));
-    } else if (this->fwd == nullptr) {
-      this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
-          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                          mkldnn::primitive::at(*this->weight),
-                                          *this->out));
-    }
-  }
-
-  const mkldnn::convolution_forward &GetFwd() const {
-    return *fwd;
-  }
-};
-
-typedef MKLDNNParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-
-static inline MKLDNNConvForward &GetConvFwd(
-    const nnvm::NodeAttrs& attrs, bool is_train,
-    const NDArray &data, const NDArray &weights,
-    const NDArray *bias, const NDArray &output) {
-  static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, MKLDNNOpHash> fwds;
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  MKLDNNConvSignature key(param);
-  key.AddSign(is_train);
-  // Here we can sign the conv op with the NDArrays because the conv primitive
-  // will decide the right layout for them, so we only need the shape and the
-  // data type of the arrays.
-  key.AddSign(data);
-  key.AddSign(weights);
-  key.AddSign(output);
-  if (bias)
-    key.AddSign(*bias);
-
-  auto it = fwds.find(key);
-  if (it == fwds.end()) {
-    MKLDNNConvForward fwd(param, is_train, data, weights, bias, output);
-    auto ins_ret = fwds.insert(
-        std::pair<MKLDNNConvSignature, MKLDNNConvForward>(key, fwd));
-    CHECK(ins_ret.second);
-    it = ins_ret.first;
-  }
-  return it->second;
-}
-
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                               const std::vector<NDArray> &in_data,
-                               const std::vector<OpReqType> &req,
-                               const std::vector<NDArray> &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  MKLDNNConvForward &fwd = GetConvFwd(attrs,
-      ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
-      param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
-
-  auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
-  const mkldnn::memory *weight_mem;
-  if (ctx.is_train) {
-    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
-    // to the default format for now.
-    if (in_data[conv::kWeight].IsMKLDNNData())
-      const_cast<NDArray &>(in_data[conv::kWeight]).Reorder2Default();
-    weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(),
-                            param.num_group);
-  } else {
-    // For inference, we want to reorder the weight array so we don't need to
-    // reorder data every time.
-    const_cast<NDArray &>(in_data[conv::kWeight]).MKLDNNDataReorder(
-        fwd.fwd_pd.weights_primitive_desc());
-    weight_mem = in_data[conv::kWeight].GetMKLDNNData();
-  }
-  auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
-                                 req[conv::kOut]);
-  const mkldnn::memory *bias_mem = nullptr;
-  if (!param.no_bias)
-    bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc());
-  fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
-  MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
-
-  CommitOutput(out_data[conv::kOut], out_mem);
-  MKLDNNStream::Get()->Submit();
-}
-
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                               const std::vector<NDArray>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<NDArray>& outputs) {
-  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
-  const std::vector<NDArray> &in_grad = outputs;
-  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
-  mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train,
-      inputs[conv::kData + 1], inputs[conv::kWeight + 1],
-      param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]);
-
-  CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
-  mkldnn::convolution_backward_data::primitive_desc bwdData_pd
-    = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
-        inputs[conv::kOut], fwd_pd);
-  auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
-      bwdData_pd.diff_dst_primitive_desc());
-  if (req[conv::kData]) {
-    auto weight_mem = GetWeights(inputs[conv::kWeight + 1],
-        bwdData_pd.weights_primitive_desc(), param.num_group);
-    auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData],
-        bwdData_pd.diff_src_primitive_desc(), req[conv::kData]);
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd,
-          *out_grad_mem, *weight_mem, *in_grad_mem.second));
-    CommitOutput(in_grad[conv::kData], in_grad_mem);
-  }
-  if (req[conv::kWeight]) {
-    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
-        = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
-                            param.no_bias ? nullptr : &inputs[conv::kBias + 1],
-                            inputs[conv::kOut], fwd_pd);
-    if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc())
-      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
-          bwdWeights_pd.diff_dst_primitive_desc());
-    auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder(
-        bwdWeights_pd.src_primitive_desc());
-    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight],
-                                                 bwdWeights_pd.diff_weights_primitive_desc(),
-                                                 req[conv::kWeight]);
-    mkldnn_output_t in_grad_bias;
-    if (param.no_bias) {
-      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
-              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
-    } else {
-      in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
-                                     bwdWeights_pd.diff_bias_primitive_desc(),
-                                     req[conv::kBias]);
-      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
-              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
-              *in_grad_bias.second));
-      CommitOutput(in_grad[conv::kBias], in_grad_bias);
-    }
-    CommitOutput(in_grad[conv::kWeight], in_grad_weight);
-  }
-  MKLDNNStream::Get()->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_USE_MKLDNN == 1
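The deleted convolution file caches forward primitives in a thread-local map keyed by a signature built from the op parameters and the input arrays, so a primitive is constructed once per shape/dtype combination per thread. The pattern, reduced to standard C++ as a sketch (ConvForward and the string signature are illustrative stand-ins for MKLDNNConvForward and MKLDNNConvSignature):

    #include <string>
    #include <unordered_map>

    // Illustrative stand-in for the cached resource; the real class owns the
    // MKLDNN primitive and its memory objects.
    struct ConvForward {
      explicit ConvForward(const std::string& /*signature*/) {}
    };

    // Thread-local cache keyed by a signature string: create on first use,
    // reuse on every later call with the same signature.
    ConvForward& GetCachedForward(const std::string& signature) {
      static thread_local std::unordered_map<std::string, ConvForward> cache;
      auto it = cache.find(signature);
      if (it == cache.end())
        it = cache.emplace(signature, ConvForward(signature)).first;
      return it->second;
    }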
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
deleted file mode 100644
index 71d540c969..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_copy.cc
- * \brief
- * \author Da Zheng
-*/
-
-#include "../softmax-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                const NDArray &in_data, const OpReqType &req,
-                const NDArray &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[0]);
-  auto in_mem = in_data.GetMKLDNNData();
-  if (req == kAddTo) {
-    TmpMemMgr::Get()->Init(ctx.requested[0]);
-    // Try to force the output memory to have the same format as the input
-    // memory. If that fails, we have to reorder the memory.
-    auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc());
-    if (out_mem == nullptr)
-      out_mem = out_data.GetMKLDNNData();
-    auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc());
-    Sum(*in_mem, *out_mem, *sum_res);
-    const_cast<NDArray &>(out_data).CopyFrom(*sum_res);
-  } else {
-    const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
-  }
-  MKLDNNStream::Get()->Submit();
-}
-
-}   // namespace op
-}   // namespace mxnet
-#endif
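MKLDNNCopy dispatches on the request type: kAddTo accumulates the input into the existing output, while the write requests simply overwrite it. A plain C++ sketch of that dispatch over raw buffers (ReqType and CopyOrAccumulate are illustrative stand-ins for mxnet's OpReqType handling, not the original API):

    #include <cstddef>

    enum class ReqType { kWriteTo, kAddTo };

    // Overwrite the output for kWriteTo, accumulate into it for kAddTo.
    void CopyOrAccumulate(const float* in, float* out, std::size_t n, ReqType req) {
      for (std::size_t i = 0; i < n; ++i)
        out[i] = (req == ReqType::kAddTo) ? out[i] + in[i] : in[i];
    }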
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
deleted file mode 100644
index d336d6dedb..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ /dev/null
@@ -1,390 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_deconvolution.cc
- * \brief
- * \author Da Zheng, Rong Zhang (rong.a.zhang@intel.com)
-*/
-
-#if MXNET_USE_MKLDNN == 1
-
-#include "../deconvolution-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-namespace mxnet {
-namespace op {
-
-static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) {
-  mkldnn::memory::dims dims(1);
-  // This is convolution on 4D data. The second dimension is the channel.
-  dims[0] = md.data.dims[1];
-  return mkldnn::memory::desc(dims,
-      static_cast<mkldnn::memory::data_type>(md.data.data_type),
-      mkldnn::memory::format::any);
-}
-
-static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_(
-    const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md,
-    bool has_bias, const mkldnn::memory::desc &out_md,
-    const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
-    const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
-  if (!has_bias) {
-    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
-        mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides,
-        dilates, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_forward::primitive_desc(desc, engine);
-  } else {
-    auto bias_md = GetBiasDesc(data_md);
-    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
-        mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md,
-        data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_forward::primitive_desc(desc, engine);
-  }
-}
-
-static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl(
-    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
-    bool has_bias, const NDArray &output) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
-  mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
-  auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
-      strides, padding, dilate);
-  mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
-      out_md, weight_md, data_md, strides, dilate, padding, padding,
-      mkldnn::padding_kind::zero);
-  return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd);
-}
-
-static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData(
-    const DeconvolutionParam &param, const NDArray &data, const NDArray &weights,
-    bool has_bias, const NDArray &output) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
-  mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
-  return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
-      strides, padding, dilate);
-}
-
-static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights(
-    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
-    bool has_bias, const NDArray &output,
-    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.num_group);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::memory::dims strides{0, 0};
-  if (param.stride.ndim() == 2) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[1];
-  } else if (param.stride.ndim() == 1) {
-    strides[0] = param.stride[0];
-    strides[1] = param.stride[0];
-  } else {
-    LOG(FATAL) << "Unsupported stride dim";
-  }
-  mkldnn::memory::dims padding{0, 0};
-  if (param.pad.ndim() == 2) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[1];
-  } else if (param.pad.ndim() == 1) {
-    padding[0] = param.pad[0];
-    padding[1] = param.pad[0];
-  } else {
-    LOG(FATAL) << "Unsupported pad dim";
-  }
-  mkldnn::memory::dims dilate{0, 0};
-  if (param.dilate.ndim() == 2) {
-    dilate[0] = param.dilate[0] - 1;
-    dilate[1] = param.dilate[1] - 1;
-  }
-  if (!has_bias) {
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-        out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-  } else {
-    auto bias_md = GetBiasDesc(data_md);
-    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
-        out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding,
-        mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-  }
-}
-
-class MKLDNNDeconvForward {
-  std::shared_ptr<mkldnn::convolution_backward_data> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> weight;
-  std::shared_ptr<mkldnn::memory> bias;
-  std::shared_ptr<mkldnn::memory> out;
-  OutDataOp data_op;
-
- public:
-  MKLDNNDeconvForward(const DeconvolutionParam& param,
-                      const NDArray &data,
-                      const NDArray &weights,
-                      bool has_bias,
-                      const NDArray &output);
-  void SetDataHandle(const DeconvolutionParam& param,
-                     const OpContext &ctx,
-                     const std::vector<NDArray> &in_data,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &out_data);
-
-  void Execute(const std::vector<NDArray> &out_data);
-
- private:
-  mkldnn::convolution_backward_data::primitive_desc fwd_pd;
-};  // class MKLDNNDeconvForward
-
-MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam& param,
-                                const NDArray &data,
-                                const NDArray &weights,
-                                bool has_bias,
-                                const NDArray &output)
-                                :fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) {
-  this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-          fwd_pd.diff_dst_primitive_desc()));
-  this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-          fwd_pd.weights_primitive_desc()));
-  this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-          fwd_pd.diff_src_primitive_desc()));
-  this->fwd = std::shared_ptr<mkldnn::convolution_backward_data>(
-    new mkldnn::convolution_backward_data(fwd_pd,
-                                          mkldnn::primitive::at(*this->data),
-                                          mkldnn::primitive::at(*this->weight),
-                                          *this->out));
-}
-
-void MKLDNNDeconvForward::SetDataHandle(const DeconvolutionParam& param,
-                                        const OpContext &ctx,
-                                        const std::vector<NDArray> &in_data,
-                                        const std::vector<OpReqType> &req,
-                                        const std::vector<NDArray> &out_data) {
-  auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
-      fwd_pd.diff_dst_primitive_desc());
-  const mkldnn::memory *weight_mem;
-  if (ctx.is_train) {
-    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
-    // to the default format for now.
-    if (in_data[deconv::kWeight].IsMKLDNNData())
-      const_cast<NDArray &>(in_data[deconv::kWeight]).Reorder2Default();
-    weight_mem = GetWeights(in_data[deconv::kWeight],
-                            fwd_pd.weights_primitive_desc(),
-                            param.num_group);
-  } else {
-    // For inference, we want to reorder the weight array so we don't need to
-    // reorder data every time.
-    const_cast<NDArray &>(in_data[deconv::kWeight]).MKLDNNDataReorder(
-        fwd_pd.weights_primitive_desc());
-    weight_mem = in_data[deconv::kWeight].GetMKLDNNData();
-  }
-  auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
-      fwd_pd.diff_src_primitive_desc(), req[deconv::kOut]);
-  auto output = out_mem.second;
-  this->data->set_data_handle(data_mem->get_data_handle());
-  this->weight->set_data_handle(weight_mem->get_data_handle());
-  this->out->set_data_handle(output->get_data_handle());
-  this->data_op = out_mem.first;
-}
-
-void MKLDNNDeconvForward::Execute(const std::vector<NDArray> &out_data) {
-  MKLDNNStream::Get()->RegisterPrim(*fwd);
-  CommitOutput(out_data[deconv::kOut], mkldnn_output_t(this->data_op, this->out.get()));
-  MKLDNNStream::Get()->Submit();
-}
-
-static void MKLDNNDeconvFwdBiasPostProcess(const DeconvolutionParam& param,
-                                           const OpContext &ctx,
-                                           const std::vector<NDArray> &in_data,
-                                           const std::vector<NDArray> &out_data) {
-  // add bias, broadcast bias to dim 1: channel
-  if (!param.no_bias) {
-    // MKLDNN only supports float right now.
-    typedef float DType;
-    Stream<cpu> *s = ctx.get_stream<cpu>();
-    Tensor<cpu, 1, DType> bias = in_data[deconv::kBias].data().get<cpu, 1, DType>(s);
-    // If the output data is stored in a special MKLDNN format, data()
-    // automatically converts its format to the default format.
-    // Unfortunately, MKLDNN doesn't support broadcast.
-    Tensor<cpu, 4, DType> out_cpu = out_data[deconv::kOut].data().get<cpu, 4, DType>(s);
-    out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_);
-  }
-}
-
-typedef MKLDNNParamOpSign<DeconvolutionParam> MKLDNNDeconvSignature;
-
-static inline MKLDNNDeconvForward &GetDeconvFwd(
-    const nnvm::NodeAttrs& attrs, const NDArray &data,
-    const NDArray &weights, const NDArray *bias,
-    const NDArray &output) {
-  static thread_local
-        std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvForward, MKLDNNOpHash> fwds;
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  MKLDNNDeconvSignature key(param);
-  // Here we can sign the deconv op with the NDArrays because the primitive
-  // will decide the right layout for them, so we only need the shape and the
-  // data type of the arrays.
-  key.AddSign(data);
-  key.AddSign(weights);
-  key.AddSign(output);
-  if (bias)
-    key.AddSign(*bias);
-
-  auto it = fwds.find(key);
-  if (it == fwds.end()) {
-    bool has_bias = (bias != nullptr);
-    MKLDNNDeconvForward fwd(param, data, weights, has_bias, output);
-    auto ins_ret = fwds.insert(
-        std::pair<MKLDNNDeconvSignature, MKLDNNDeconvForward>(key, fwd));
-    CHECK(ins_ret.second);
-    it = ins_ret.first;
-  }
-  return it->second;
-}
-
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                                const std::vector<NDArray> &in_data,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-
-  MKLDNNDeconvForward &deconvFwd = GetDeconvFwd(
-      attrs, in_data[deconv::kData], in_data[deconv::kWeight],
-      param.no_bias ? nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]);
-
-  deconvFwd.SetDataHandle(param, ctx, in_data, req, out_data);
-
-  deconvFwd.Execute(out_data);
-
-  MKLDNNDeconvFwdBiasPostProcess(param, ctx, in_data, out_data);
-}
-
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                                 const std::vector<NDArray>& inputs,
-                                 const std::vector<OpReqType>& req,
-                                 const std::vector<NDArray>& outputs) {
-  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
-  const std::vector<NDArray> &in_grad = outputs;
-  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-  CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace";
-  mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData(
-      param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false,
-      inputs[deconv::kOut]);
-  auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
-      bwdData_pd.src_primitive_desc());
-  if (req[deconv::kData]) {
-    auto weight_mem = GetWeights(inputs[deconv::kWeight + 1],
-                                 bwdData_pd.weights_primitive_desc(),
-                                 param.num_group);
-    auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData],
-                                       bwdData_pd.dst_primitive_desc(),
-                                       req[deconv::kData]);
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd,
-          *out_grad_mem, *weight_mem, *in_grad_mem.second));
-    CommitOutput(in_grad[deconv::kData], in_grad_mem);
-  }
-  if (req[deconv::kWeight]) {
-    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
-      = GetDeconvBwdWeights(param, inputs[deconv::kData + 1],
-          inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd);
-    if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc())
-      out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
-          bwdWeights_pd.src_primitive_desc());
-    auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder(
-        bwdWeights_pd.diff_dst_primitive_desc());
-    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight],
-                                                 bwdWeights_pd.diff_weights_primitive_desc(),
-                                                 req[deconv::kWeight]);
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
-          bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second));
-    CommitOutput(in_grad[deconv::kWeight], in_grad_weight);
-  }
-  MKLDNNStream::Get()->Submit();
-  if (!param.no_bias) {
-    typedef float DType;
-    Stream<cpu> *s = ctx.get_stream<cpu>();
-    Tensor<cpu, 1, DType> gbias = in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
-    // If there is bias, the out grad has already been converted to the default
-    // format, so this shouldn't cause any performance issues.
-    Tensor<cpu, 4, DType> grad = inputs[deconv::kOut].data().get<cpu, 4, DType>(s);
-    Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad));
-  }
-}
-
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_USE_MKLDNN == 1
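Because MKLDNN has no broadcast primitive, the deleted deconvolution forward adds the bias only after the convolution_backward_data primitive has produced the output, broadcasting one value per channel over an NCHW tensor. A standalone sketch of that post-processing step, assuming a contiguous NCHW layout (the function and dimension names are illustrative):

    #include <cstddef>
    #include <vector>

    // Add one bias value per channel to every spatial position of an NCHW
    // output, mirroring the broadcast done in MKLDNNDeconvFwdBiasPostProcess.
    void AddChannelBias(std::vector<float>* out, const std::vector<float>& bias,
                        std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
      for (std::size_t ni = 0; ni < n; ++ni)
        for (std::size_t ci = 0; ci < c; ++ci)
          for (std::size_t hw = 0; hw < h * w; ++hw)
            (*out)[((ni * c) + ci) * h * w + hw] += bias[ci];
    }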
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
deleted file mode 100644
index a8b85bbeb1..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_fully_connected.cc
- * \brief
- * \author Da Zheng
-*/
-
-#include "../fully_connected-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd(
-    const NDArray &data, const NDArray &weight, const NDArray *bias,
-    const mkldnn::memory::desc &out_md) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetMemDesc(weight);
-  auto engine = CpuEngine::Get()->get_engine();
-  if (bias) {
-    auto bias_md = GetMemDesc(*bias);
-    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
-        data_md, weight_md, bias_md, out_md);
-    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
-  } else {
-    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
-        data_md, weight_md, out_md);
-    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
-  }
-}
-
-inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData(
-    const NDArray &data, const NDArray &weight, const NDArray &output,
-    mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetMemDesc(weight);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md);
-  return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd);
-}
-
-inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights(
-    const NDArray &data, const NDArray &weight, const NDArray *bias,
-    const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
-  auto data_md = GetMemDesc(data);
-  auto weight_md = GetMemDesc(weight);
-  auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
-  if (bias) {
-    auto bias_md = GetMemDesc(*bias);
-    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
-        weight_md, bias_md, out_md);
-    return mkldnn::inner_product_backward_weights::primitive_desc(
-        ipBwdWeights_desc, engine, ipFwd_pd);
-  } else {
-    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
-        weight_md, out_md);
-    return mkldnn::inner_product_backward_weights::primitive_desc(
-        ipBwdWeights_desc, engine, ipFwd_pd);
-  }
-}
-
-void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                     const std::vector<NDArray> &in_data,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  const TShape& ishape = in_data[fullc::kData].shape();
-  const TShape& oshape = out_data[fullc::kOut].shape();
-  NDArray weight = in_data[fullc::kWeight];
-  NDArray data = in_data[fullc::kData];
-  auto out_md = GetMemDesc(out_data[fullc::kOut]);
-  if (data.shape().ndim() != 2 && !param.flatten) {
-    data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
-                                     ishape[ishape.ndim()-1]));
-    mkldnn::memory::dims out_dims{static_cast<int>(oshape.ProdShape(0, oshape.ndim()-1)),
-      static_cast<int>(oshape[ishape.ndim()-1])};
-    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()),
-      mkldnn::memory::format::any);
-  } else if (data.shape().ndim() != 2) {
-    data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
-    mkldnn::memory::dims out_dims{static_cast<int>(oshape[0]),
-      static_cast<int>(oshape.ProdShape(1, oshape.ndim()))};
-    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()),
-      mkldnn::memory::format::any);
-  }
-
-  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
-      param.no_bias ? nullptr : &in_data[fullc::kBias], out_md);
-  auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc());
-  auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc());
-  auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut],
-      ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]);
-  if (param.no_bias) {
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(
-          ipFwd_pd, *data_mem, *weight_mem, *out_mem.second));
-  } else {
-    auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc());
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd,
-          *data_mem, *weight_mem, *bias_mem, *out_mem.second));
-  }
-  CommitOutput(out_data[fullc::kOut], out_mem);
-  MKLDNNStream::Get()->Submit();
-}
-
-void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                      const std::vector<NDArray> &inputs,
-                      const std::vector<OpReqType> &req,
-                      const std::vector<NDArray> &outputs) {
-  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
-  const std::vector<NDArray> &in_grad = outputs;
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  const TShape& ishape = inputs[fullc::kData + 1].shape();
-  const TShape& oshape = inputs[fullc::kOut].shape();
-
-  NDArray weight = inputs[fullc::kWeight + 1];
-  NDArray data = inputs[fullc::kData + 1];
-  if (data.shape().ndim() != 2 && !param.flatten)
-    data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
-                                     ishape[ishape.ndim()-1]));
-  else if (data.shape().ndim() != 2)
-    data = data.MKLDNNDataReshape(Shape2(ishape[0],
-                                     ishape.ProdShape(1, ishape.ndim())));
-  NDArray out_grad = inputs[fullc::kOut];
-  if (out_grad.shape().ndim() != 2 && !param.flatten)
-    out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1),
-                                             oshape[oshape.ndim()-1]));
-  else if (out_grad.shape().ndim() != 2)
-    out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape[0],
-                                             oshape.ProdShape(1, oshape.ndim())));
-
-  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
-      param.no_bias ? nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad));
-
-  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
-  if (req[fullc::kData]) {
-    mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData(
-        data, weight, out_grad, ipFwd_pd);
-    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
-        ipBwdData_pd.diff_dst_primitive_desc());
-    auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc());
-    auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData],
-                                       ipBwdData_pd.diff_src_primitive_desc(),
-                                       req[fullc::kData]);
-    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data(
-          ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second));
-    CommitOutput(in_grad[fullc::kData], in_grad_mem);
-  }
-  if (req[fullc::kWeight]) {
-    mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd
-      = GetIPBwdWeights(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias],
-          out_grad, ipFwd_pd);
-    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
-        ipBwdWeights_pd.diff_dst_primitive_desc());
-    auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc());
-    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight],
-                                                 ipBwdWeights_pd.diff_weights_primitive_desc(),
-                                                 req[fullc::kWeight]);
-    mkldnn_output_t in_grad_bias;
-    if (param.no_bias) {
-      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
-            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
-    } else {
-      in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias],
-                                     ipBwdWeights_pd.diff_bias_primitive_desc(),
-                                     req[fullc::kBias]);
-      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
-            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
-            *in_grad_bias.second));
-    }
-    CommitOutput(in_grad[fullc::kWeight], in_grad_weight);
-    CommitOutput(in_grad[fullc::kBias], in_grad_bias);
-  }
-  MKLDNNStream::Get()->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN == 1
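The deleted fully-connected forward reshapes any input with more than two dimensions to 2-D before building the inner-product primitive: with flatten=true it keeps the batch axis and collapses the rest, with flatten=false it collapses the leading axes and keeps the last one. A standalone sketch of that shape computation (FlattenTo2D is an illustrative helper, not part of the original file):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Return the 2-D (rows, cols) shape used for the inner product.
    std::pair<std::size_t, std::size_t> FlattenTo2D(
        const std::vector<std::size_t>& shape, bool flatten) {
      std::size_t lead = 1, tail = 1;
      if (flatten) {
        // (batch, product of remaining dims)
        lead = shape.front();
        for (std::size_t i = 1; i < shape.size(); ++i) tail *= shape[i];
      } else {
        // (product of leading dims, last dim)
        for (std::size_t i = 0; i + 1 < shape.size(); ++i) lead *= shape[i];
        tail = shape.back();
      }
      return {lead, tail};
    }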
diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
deleted file mode 100644
index 9a9bf62b67..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_lrn-inl.h
- * \brief
- * \author Patric Zhao, patric.zhao@intel.com
-*/
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
-
-#if MXNET_USE_MKLDNN == 1
-#include <mkldnn.hpp>
-#include "../lrn-inl.h"
-#include "./mkldnn_base-inl.h"
-
-namespace mxnet {
-namespace op {
-
-inline algorithm GetMKLDNNLRNAlgo(const LRNParam &param) {
-  // TODO(Patric): lrn_within_channel causes a core dump in the MKLDNN backward
-  //               pass. Need to confirm with the MKLDNN team and fix later.
-  return algorithm::lrn_across_channels;
-}
-
-inline lrn_forward::primitive_desc GetLRNFwd(const LRNParam &param,
-                                             const bool is_train,
-                                             const memory::desc &src_md) {
-  const auto  engine = CpuEngine::Get()->get_engine();
-  const auto  alg = GetMKLDNNLRNAlgo(param);
-  const float alpha = param.alpha;
-  const float beta = param.beta;
-  const int   nsize = param.nsize;
-  const float k = param.knorm;
-  auto kind = prop_kind::forward_training;
-  if (is_train) {
-    kind = prop_kind::forward_training;
-  } else {
-    kind = prop_kind::forward_scoring;
-  }
-  lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k);
-  return mkldnn::lrn_forward::primitive_desc(fwd_desc, engine);
-}
-
-inline mkldnn::lrn_backward::primitive_desc
-GetLRNBwd(const LRNParam &param,
-          const mkldnn::memory::desc &diff_in_md,
-          const mkldnn::memory::desc &diff_md,
-          const lrn_forward::primitive_desc &lrnFwd_desc) {
-  const auto engine = CpuEngine::Get()->get_engine();
-  const auto alg = GetMKLDNNLRNAlgo(param);
-  const float alpha = param.alpha;
-  const float beta = param.beta;
-  const int nsize = param.nsize;
-  const float k = param.knorm;
-
-  lrn_backward::desc lrnBwd_desc(alg, diff_in_md,
-                diff_md, nsize, alpha, beta, k);
-  return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc,
-                               engine, lrnFwd_desc);
-}
-
-void MKLDNNLRNForward(const OpContext &ctx,
-                      const LRNParam &param,
-                      const NDArray &in_data,
-                      const OpReqType req,
-                      const NDArray &out_data) {
-  auto src_mem = in_data.GetMKLDNNData();
-  const auto src_md = src_mem->get_primitive_desc().desc();
-  const auto pdesc = GetLRNFwd(param, ctx.is_train, src_md);
-  auto dst_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData(
-          pdesc.dst_primitive_desc());
-  if (ctx.is_train) {
-    std::shared_ptr<const mkldnn::memory> ws_mem(
-            new mkldnn::memory(pdesc.workspace_primitive_desc()));
-    MKLDNNStream::Get()->RegisterPrim(
-        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem),
-        *ws_mem, *dst_mem));
-    MKLDNNStream::Get()->Submit();
-  } else {
-    MKLDNNStream::Get()->RegisterPrim(
-        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem));
-    MKLDNNStream::Get()->Submit();
-  }
-}
-
-void MKLDNNLRNBackward(const OpContext &ctx, const LRNParam &param,
-                       const NDArray &out_grad,
-                       const NDArray &in_data,
-                       const OpReqType req,
-                       const NDArray &in_grad) {
-  if (req == kNullOp) {
-    return;
-  }
-  // Repeat FW for getting workspace
-  auto data_mem = in_data.GetMKLDNNData();
-  const auto data_md = data_mem->get_primitive_desc().desc();
-  const auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md);
-
-  // TODO(Patric): To keep the function stateless, we can't pass workspace
-  //               from LRN forward to backward. We have to re-compute
-  //               LRN forward to get the workspace.
-  //               Will refine this code later.
-  std::shared_ptr<const mkldnn::memory> ws_mem(
-          new mkldnn::memory(pdesc_fwd.workspace_primitive_desc()));
-  std::shared_ptr<const mkldnn::memory> dst_temp(
-          new mkldnn::memory(pdesc_fwd.dst_primitive_desc()));
-  MKLDNNStream::Get()->RegisterPrim(
-          lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem),
-          *ws_mem, *dst_temp));
-
-  const auto data_in_md = pdesc_fwd.src_primitive_desc().desc();
-  auto diff_mem = out_grad.GetMKLDNNData();
-  const auto diff_md = diff_mem->get_primitive_desc().desc();
-  const auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd);
-  auto diff_src_mem = CreateMKLDNNMem(in_grad,
-          pdesc_bwd.diff_src_primitive_desc(), req);
-
-  MKLDNNStream::Get()->RegisterPrim(
-        lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem),
-        mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second));
-  MKLDNNStream::Get()->Submit();
-}
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
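The deleted LRN header pins the algorithm to lrn_across_channels (the within-channel variant is disabled per the TODO above). As a reference for what that primitive normalizes, a small sketch of across-channel LRN at one spatial position, reusing the alpha, beta, nsize and knorm parameters; the exact alpha/nsize scaling is one common convention and an assumption here, as is the LRNAcrossChannels helper:

#include <algorithm>
#include <cmath>
#include <vector>

// out[c] = in[c] / (k + alpha/nsize * sum over a window of nsize channels of in[j]^2)^beta
// (one common convention for across-channel LRN).
std::vector<float> LRNAcrossChannels(const std::vector<float> &in,  // all channels at one pixel
                                     int nsize, float alpha, float beta, float k) {
  const int C = static_cast<int>(in.size());
  std::vector<float> out(C);
  for (int c = 0; c < C; ++c) {
    const int lo = std::max(0, c - nsize / 2);
    const int hi = std::min(C - 1, c + nsize / 2);
    float sumsq = 0.f;
    for (int j = lo; j <= hi; ++j) sumsq += in[j] * in[j];
    out[c] = in[c] / std::pow(k + alpha / nsize * sumsq, beta);
  }
  return out;
}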
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
deleted file mode 100644
index 9149cb0c6a..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_ops-inl.h
- * \brief
- * \author Da Zheng
-*/
-
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
-
-#if MXNET_USE_MKLDNN == 1
-
-#include <mxnet/io.h>
-#include <mxnet/base.h>
-#include <mxnet/ndarray.h>
-#include <mxnet/operator.h>
-#include <mxnet/operator_util.h>
-#include <dmlc/logging.h>
-#include <dmlc/optional.h>
-#include <vector>
-#include <mkldnn.hpp>
-
-namespace mxnet {
-namespace op {
-
-/* For fully connected. */
-void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                     const std::vector<NDArray> &in_data,
-                     const std::vector<OpReqType> &req,
-                     const std::vector<NDArray> &out_data);
-void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                      const std::vector<NDArray> &inputs,
-                      const std::vector<OpReqType> &req,
-                      const std::vector<NDArray> &outputs);
-
-/* For convolution. */
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                              const std::vector<NDArray> &in_data,
-                              const std::vector<OpReqType> &req,
-                              const std::vector<NDArray> &out_data);
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                               const std::vector<NDArray>& inputs,
-                               const std::vector<OpReqType>& req,
-                               const std::vector<NDArray>& outputs);
-
-/* For deconvolution */
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                                const std::vector<NDArray> &in_data,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &out_data);
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                                 const std::vector<NDArray>& inputs,
-                                 const std::vector<OpReqType>& req,
-                                 const std::vector<NDArray>& outputs);
-
-/* For softmax */
-void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                          const NDArray &in_data, const OpReqType &req,
-                          const NDArray &out_data);
-
-/* For sum */
-void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                      const std::vector<NDArray> &inputs, const OpReqType &req,
-                      const NDArray &out_data);
-
-/* For copy */
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-    const NDArray &in_data, const OpReqType &req,
-    const NDArray &out_data);
-
-/* For concat */
-void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                         const std::vector<NDArray> &in_data,
-                         const std::vector<OpReqType> &req,
-                         const std::vector<NDArray> &out_data);
-void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                          const std::vector<NDArray>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<NDArray>& outputs);
-
-/* For activation */
-void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                             const NDArray &in_data, const OpReqType &req,
-                             const NDArray &out_data);
-void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                              const NDArray &out_grad, const NDArray &in_data,
-                              const OpReqType &req, const NDArray &in_grad);
-
-void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
-         const mkldnn::memory &out);
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN == 1
-
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
deleted file mode 100644
index 4f2f71866e..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_pooling-inl.h
- * \brief
-*/
-#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
-#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
-
-#if MXNET_USE_MKLDNN == 1
-
-#include <utility>
-#include <mkldnn.hpp>
-#include "../pooling-inl.h"
-#include "./mkldnn_base-inl.h"
-
-namespace mxnet {
-namespace op {
-
-class MKLDNNPoolingFwd {
- public:
-  MKLDNNPoolingFwd(const mxnet::NDArray &input,
-                   const mxnet::NDArray &output,
-                   const int kernel_h, const int kernel_w,
-                   const int stride_h, const int stride_w,
-                   const int padding_t, const int padding_b,
-                   const int padding_l, const int padding_r,
-                   const mkldnn::algorithm alg_kind,
-                   const bool with_workspace, const bool is_train) :
-                   is_train_(is_train),
-                   with_workspace_(with_workspace),
-                   alg_kind_(alg_kind),
-                   fwd_(nullptr), data_(nullptr), out_(nullptr), workspace_(nullptr) {
-    Init(input, output,
-         kernel_h, kernel_w, stride_h, stride_w,
-         padding_t, padding_b, padding_l, padding_r);
-  }
-
-  ~MKLDNNPoolingFwd() {}
-  void SetDataHandle(const mxnet::NDArray &data,
-                     const mxnet::NDArray &output,
-                     const mxnet::NDArray *workspace = nullptr);
-  void Execute();
-
- private:
-  bool is_train_;
-  bool with_workspace_;
-  mkldnn::algorithm alg_kind_;
-  std::shared_ptr<mkldnn::pooling_forward::primitive_desc> fwd_pd_;
-  std::shared_ptr<mkldnn::pooling_forward> fwd_;
-  std::shared_ptr<mkldnn::memory> data_;
-  std::shared_ptr<mkldnn::memory> out_;
-  std::shared_ptr<mkldnn::memory> workspace_;
-
- private:
-  void Init(const mxnet::NDArray &input,
-            const mxnet::NDArray &output,
-            const int kernel_h, const int kernel_w,
-            const int stride_h, const int stride_w,
-            const int padding_t, const int padding_b,
-            const int padding_l, const int padding_r);
-};
-
-inline bool SupportMKLDNNPooling(const PoolingParam &param) {
-  return param.kernel.ndim() == 2 &&
-         (param.pool_type == pool_enum::kMaxPooling ||
-          param.pool_type == pool_enum::kAvgPooling)
-         // This is a temporary fix. There is a bug in global pooling of MKLDNN.
-         && !param.global_pool;
-}
-
-inline bool SupportMKLDNNPooling(const PoolingParam &param,
-                                 const TShape &dshape) {
-  bool ret = SupportMKLDNNPooling(param);
-  if (!ret)
-    return false;
-
-  if (param.pooling_convention == pool_enum::kValid)
-    return true;
-
-  if (((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0) &&
-      ((dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0))
-    return true;
-  else
-    return false;
-}
-
-inline bool MKLDNNRequireWorkspace(const PoolingParam &param) {
-  return param.pool_type != pool_enum::kAvgPooling;
-}
-
-typedef MKLDNNParamOpSign<PoolingParam> MKLDNNPoolingSignature;
-void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam &param,
-                          const NDArray &in_data, const OpReqType req,
-                          const NDArray &out_data, const NDArray *workspace);
-
-void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
-                              const NDArray &out_grad, const NDArray &in_data,
-                              const NDArray *workspace, const OpReqType req,
-                              const NDArray &in_grad);
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN == 1
-#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
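SupportMKLDNNPooling above only keeps the MKLDNN path under the kFull convention when no ceil-mode output is needed, i.e. when the padded input divides evenly by the stride. A short sketch of that predicate with concrete numbers; EvenlyDivisible is a hypothetical helper mirroring the check above:

#include <cassert>

// True when floor-mode pooling output matches what the kFull (ceil) convention expects.
bool EvenlyDivisible(int dim, int pad, int kernel, int stride) {
  return (dim + 2 * pad - kernel) % stride == 0;
}

int main() {
  // 7x7 input, 3x3 kernel, stride 2, no pad: (7 - 3) % 2 == 0, so the MKLDNN path is safe.
  assert(EvenlyDivisible(7, 0, 3, 2));
  // 8x8 input, 3x3 kernel, stride 2, no pad: (8 - 3) % 2 == 1, so fall back under kFull.
  assert(!EvenlyDivisible(8, 0, 3, 2));
  return 0;
}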
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc
deleted file mode 100644
index 6eeecaf072..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_pooling.cc
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_pooling.cc
- * \brief
- * \author Tao Lv
-*/
-
-#if MXNET_USE_MKLDNN == 1
-
-#include "./mkldnn_pooling-inl.h"
-
-namespace mxnet {
-namespace op {
-
-void MKLDNNPoolingFwd::Init(const mxnet::NDArray &input, const mxnet::NDArray &output,
-                            const int kernel_h,  const int kernel_w,
-                            const int stride_h,  const int stride_w,
-                            const int padding_t, const int padding_b,
-                            const int padding_l, const int padding_r) {
-  // mkldnn::memory::desc
-  auto src_md = input.GetMKLDNNData()->get_primitive_desc().desc();
-  mkldnn::memory::dims dims = {src_md.data.dims[0],
-                               src_md.data.dims[1],
-                               static_cast<int>(output.shape()[2]),
-                               static_cast<int>(output.shape()[3])};
-  auto dst_md = mkldnn::memory::desc({dims},
-                                     static_cast<mkldnn::memory::data_type>(src_md.data.data_type),
-                                     static_cast<mkldnn::memory::format>(src_md.data.format));
-  const mkldnn::engine engine = CpuEngine::Get()->get_engine();
-  const mkldnn::algorithm alg_kind = this->alg_kind_;
-  if (alg_kind != mkldnn::algorithm::pooling_max &&
-      alg_kind != mkldnn::algorithm::pooling_avg &&
-      alg_kind != mkldnn::algorithm::pooling_avg_include_padding &&
-      alg_kind != mkldnn::algorithm::pooling_avg_exclude_padding) {
-    LOG(FATAL) << "MKLDNN Pooling: algorithm is not supported";
-  }
-
-  mkldnn::prop_kind prop = mkldnn::prop_kind::forward_scoring;
-  if (this->is_train_ && alg_kind != mkldnn::algorithm::pooling_avg) {
-    prop = mkldnn::prop_kind::forward_training;
-  }
-  if (this->is_train_ && prop == mkldnn::prop_kind::forward_scoring) {
-    LOG(INFO) << "MKLDNN Pooling: training with prop_kind is forward_scoring";
-  }
-
-  const mkldnn::memory::dims strides = {stride_h,  stride_w  };
-  const mkldnn::memory::dims pad_l   = {padding_t, padding_l };
-  const mkldnn::memory::dims pad_r   = {padding_b, padding_r };
-  const mkldnn::memory::dims kernel  = {kernel_h,  kernel_w  };
-  // mkldnn::pooling_forward::desc
-  const auto fwd_desc = mkldnn::pooling_forward::desc(prop, alg_kind, src_md, dst_md,
-                                                      strides, kernel, pad_l, pad_r,
-                                                      mkldnn::padding_kind::zero);
-  this->fwd_pd_.reset(new mkldnn::pooling_forward::primitive_desc(fwd_desc, engine));
-  this->data_.reset(new mkldnn::memory(input.GetMKLDNNData()->get_primitive_desc()));
-  this->out_.reset(new mkldnn::memory(this->fwd_pd_->dst_primitive_desc()));
-  if (this->with_workspace_) {
-    this->workspace_.reset(new mkldnn::memory(this->fwd_pd_->workspace_primitive_desc()));
-    this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_),
-                                                 mkldnn::primitive::at(*(this->data_)),
-                                                 *(this->out_),
-                                                 *(this->workspace_)));
-  } else {
-    this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_),
-                                                 mkldnn::primitive::at(*(this->data_)),
-                                                 *(this->out_)));
-  }
-  return;
-}
-
-void MKLDNNPoolingFwd::SetDataHandle(const mxnet::NDArray &data,
-                                     const mxnet::NDArray &output,
-                                     const mxnet::NDArray *workspace) {
-  // mkldnn::memory
-  auto data_mem = data.GetMKLDNNData();
-  auto out_mem = const_cast<NDArray&>(output).CreateMKLDNNData(
-                                                  this->fwd_pd_->dst_primitive_desc());
-  this->data_->set_data_handle(data_mem->get_data_handle());
-  this->out_->set_data_handle(out_mem->get_data_handle());
-  if (this->with_workspace_ && workspace == nullptr) {
-    LOG(FATAL) << "MKLDNN Pooling: incorrect workspace input";
-  }
-
-  if (this->with_workspace_) {
-    // mkldnn::memory
-    auto ws_mem = workspace->GetMKLDNNData();
-    this->workspace_->set_data_handle(ws_mem->get_data_handle());
-  }
-}
-
-void MKLDNNPoolingFwd::Execute() {
-  if (this->fwd_) {
-    MKLDNNStream::Get()->RegisterPrim(*(this->fwd_));
-    MKLDNNStream::Get()->Submit();
-  } else {
-    LOG(FATAL) << "MKLDNN Pooling: forward primitive is nullptr";
-  }
-}
-
-mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
-  switch (param.pool_type) {
-    case pool_enum::kMaxPooling:
-      return mkldnn::algorithm::pooling_max;
-      break;
-    case pool_enum::kAvgPooling:
-      return mkldnn::algorithm::pooling_avg_include_padding;
-      break;
-    default:
-      LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
-      return mkldnn::algorithm::pooling_max;
-  }
-}
-
-mkldnn::pooling_forward::primitive_desc GetPoolingFwd(const PoolingParam &param,
-                                                      const bool is_train,
-                                                      const memory::desc &data_md,
-                                                      const memory::desc &out_md) {
-  CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
-  int kernel_h_, kernel_w_;
-  if (param.global_pool) {
-    kernel_h_ = data_md.data.dims[2];
-    kernel_w_ = data_md.data.dims[3];
-  } else {
-    kernel_h_ = param.kernel[0];
-    kernel_w_ = param.kernel[1];
-  }
-
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-
-  const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
-  const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
-  const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
-
-  const mkldnn::engine engine = CpuEngine::Get()->get_engine();
-  if (param.global_pool) {
-    CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-        << "With Global_pooling: true; only pad = 0 and stride = 1";
-  }
-  if (pad_t_ != 0 || pad_l_ != 0) {
-    CHECK(param.pool_type == pool_enum::kAvgPooling ||
-          param.pool_type == pool_enum::kMaxPooling)
-        << "Padding implemented only for average and max pooling.";
-    CHECK_LT(pad_l_, kernel_w_);
-    CHECK_LT(pad_t_, kernel_h_);
-  }
-
-
-  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
-  mkldnn::prop_kind kind = mkldnn::prop_kind::forward_scoring;
-  if (is_train && alg != algorithm::pooling_avg) {
-    kind = mkldnn::prop_kind::forward_training;
-  }
-
-  const pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md,
-                                              {static_cast<int>(stride_h_),
-                                               static_cast<int>(stride_w_)},
-                                              {kernel_h_, kernel_w_},
-                                              {static_cast<int>(pad_t_),
-                                               static_cast<int>(pad_l_)},
-                                              {static_cast<int>(pad_b_),
-                                               static_cast<int>(pad_r_)},
-                                              padding_kind::zero);
-  return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine);
-}
-
-MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
-                                const bool is_train,
-                                const NDArray &data,
-                                const NDArray &output) {
-  static thread_local std::unordered_map<MKLDNNPoolingSignature,
-                                         MKLDNNPoolingFwd,
-                                         MKLDNNOpHash> pooling_fwds;
-
-  bool with_workspace = is_train && MKLDNNRequireWorkspace(param);
-  MKLDNNPoolingSignature key(param);
-  key.AddSign(is_train);
-  key.AddSign(with_workspace);
-  key.AddSign(data);
-  key.AddSign(output);
-
-  auto it = pooling_fwds.find(key);
-  if (it == pooling_fwds.end()) {
-    CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
-    auto data_md = data.GetMKLDNNData()->get_primitive_desc().desc();
-    int kernel_h_, kernel_w_;
-    if (param.global_pool) {
-      kernel_h_ = data_md.data.dims[2];
-      kernel_w_ = data_md.data.dims[3];
-    } else {
-      kernel_h_ = param.kernel[0];
-      kernel_w_ = param.kernel[1];
-    }
-
-    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-
-    const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
-    const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
-    const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
-
-    if (param.global_pool) {
-        CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-            << "With Global_pooling: true; only pad = 0 and stride = 1";
-    }
-
-    if (pad_t_ != 0 || pad_l_ != 0) {
-        CHECK(param.pool_type == pool_enum::kAvgPooling ||
-              param.pool_type == pool_enum::kMaxPooling)
-              << "Padding implemented only for average and max pooling.";
-        CHECK_LT(pad_l_, kernel_w_);
-        CHECK_LT(pad_t_, kernel_h_);
-    }
-
-    const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
-    MKLDNNPoolingFwd fwd(data, output, kernel_h_, kernel_w_, stride_h_, stride_w_,
-                         pad_t_, pad_b_, pad_l_, pad_r_, alg, with_workspace, is_train);
-    auto ins_ret = pooling_fwds.insert(
-        std::pair<MKLDNNPoolingSignature, MKLDNNPoolingFwd>(key, fwd));
-    CHECK(ins_ret.second);
-    it = ins_ret.first;
-  }
-  return it->second;
-}
-
-void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam &param,
-                          const NDArray &in_data, const OpReqType req,
-                          const NDArray &out_data, const NDArray *workspace) {
-  auto fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data);
-  fwd.SetDataHandle(in_data, out_data, workspace);
-  fwd.Execute();
-}
-
-void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
-                              const NDArray &out_grad, const NDArray &in_data,
-                              const NDArray *workspace, const OpReqType req,
-                              const NDArray &in_grad) {
-  if (req == kNullOp) {
-    return;
-  }
-
-  TmpMemMgr::Get()->Init(ctx.requested[0]);
-  // mkldnn::memory
-  auto diff_dst_mem = out_grad.GetMKLDNNData();
-  auto input_mem = in_data.GetMKLDNNData();
-  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
-  const mkldnn::memory::desc data_md = data_mpd.desc();
-  const memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
-                             static_cast<int>(out_grad.shape()[2]),
-                             static_cast<int>(out_grad.shape()[3])};
-  const memory::desc out_md({dims},
-                            static_cast<memory::data_type>(data_md.data.data_type),
-                            static_cast<memory::format>(data_md.data.format));
-  auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
-
-  const mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc();
-  const memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1],
-                              static_cast<int>(in_grad.shape()[2]),
-                              static_cast<int>(in_grad.shape()[3])};
-  const memory::desc diff_in_md(
-      {dims1}, static_cast<memory::data_type>(diff_md.data.data_type),
-      static_cast<memory::format>(diff_md.data.format));
-  const mkldnn::engine  cpu_engine = data_mpd.get_engine();
-  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
-
-  int kernel_h_, kernel_w_;
-  if (param.global_pool) {
-    kernel_h_ = data_md.data.dims[2];
-    kernel_w_ = data_md.data.dims[3];
-  } else {
-    kernel_h_ = param.kernel[0];
-    kernel_w_ = param.kernel[1];
-  }
-  const pooling_backward::desc desc(alg, diff_in_md, diff_md,
-                                    {static_cast<int>(param.stride[0]),
-                                     static_cast<int>(param.stride[1])},
-                                    {kernel_h_, kernel_w_},
-                                    {static_cast<int>(param.pad[0]),
-                                     static_cast<int>(param.pad[1])},
-                                    {static_cast<int>(param.pad[0]),
-                                     static_cast<int>(param.pad[1])},
-                                    mkldnn::padding_kind::zero);
-  const pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd);
-
-  auto diff_src_mem =
-      CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req);
-
-  if (MKLDNNRequireWorkspace(param)) {
-    CHECK(workspace != nullptr);
-    auto workspace_mem = workspace->GetMKLDNNData();
-    MKLDNNStream::Get()->RegisterPrim(
-        pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem),
-                         *diff_src_mem.second));
-  } else {
-    MKLDNNStream::Get()->RegisterPrim(
-        pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second));
-  }
-  CommitOutput(in_grad, diff_src_mem);
-  MKLDNNStream::Get()->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_USE_MKLDNN == 1
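GetPoolingFwd above memoizes one MKLDNNPoolingFwd per unique signature (param, is_train, workspace flag, input and output arrays) in a thread_local map, so the primitive is built once per thread and later calls only swap data handles via SetDataHandle. A minimal sketch of that caching pattern with a simplified string key; PoolingFwd and GetCachedFwd are hypothetical stand-ins, not the MXNet signature types:

#include <string>
#include <unordered_map>

// Stand-in for an expensive-to-build forward primitive.
struct PoolingFwd {
  explicit PoolingFwd(const std::string &desc) : desc_(desc) {}
  std::string desc_;
};

// One cache per thread: construct on first use, reuse afterwards.
PoolingFwd &GetCachedFwd(const std::string &key) {
  static thread_local std::unordered_map<std::string, PoolingFwd> cache;
  auto it = cache.find(key);
  if (it == cache.end()) {
    it = cache.emplace(key, PoolingFwd(key)).first;  // built once per unique key
  }
  return it->second;
}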
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
deleted file mode 100644
index aa59f13d06..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_softmax.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_softmax.cc
- * \brief
- * \author Da Zheng
-*/
-
-#include "../softmax-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                          const NDArray &in_data, const OpReqType &req,
-                          const NDArray &out_data) {
-  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-  auto input_mem = in_data.GetMKLDNNData();
-  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
-  mkldnn::memory::desc data_md = data_mpd.desc();
-  auto cpu_engine = data_mpd.get_engine();
-  auto prop = ctx.is_train
-    ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
-  mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop,
-      data_md, param.axis);
-  mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine);
-
-  auto output_memory = out_data.GetMKLDNNData();
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory));
-  stream->Submit();
-}
-
-}   // namespace op
-}   // namespace mxnet
-#endif
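MKLDNNSoftmaxForward passes param.axis straight into mkldnn::softmax_forward::desc. As a reference for the computation, a plain-C++ softmax over the last axis of a row-major buffer; SoftmaxLastAxis is illustrative only and fixes the axis to the innermost dimension:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax along the last axis of a [rows x cols] row-major buffer.
void SoftmaxLastAxis(std::vector<float> *data, std::size_t rows, std::size_t cols) {
  for (std::size_t r = 0; r < rows; ++r) {
    float *row = data->data() + r * cols;
    const float mx = *std::max_element(row, row + cols);
    float sum = 0.f;
    for (std::size_t c = 0; c < cols; ++c) {
      row[c] = std::exp(row[c] - mx);   // subtract the row max for stability
      sum += row[c];
    }
    for (std::size_t c = 0; c < cols; ++c) row[c] /= sum;
  }
}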
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
deleted file mode 100644
index f3aeacf17d..0000000000
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_sum.cc
- * \brief
- * \author Da Zheng
-*/
-#include <iostream>
-
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-
-#if MXNET_USE_MKLDNN == 1
-namespace mxnet {
-namespace op {
-
-void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
-         const mkldnn::memory &out) {
-  std::vector<mkldnn::memory::primitive_desc> input_pds(2);
-  std::vector<float> scales(2, 1);
-  std::vector<mkldnn::primitive::at> inputs;
-  input_pds[0] = arr1.get_primitive_desc();
-  input_pds[1] = arr2.get_primitive_desc();
-  CHECK(input_pds[0] == input_pds[1]);
-  inputs.push_back(arr1);
-  inputs.push_back(arr2);
-  // TODO(zhengda) I need to reorder memory here.
-  mkldnn::sum::primitive_desc sum_pd(scales, input_pds);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out));
-}
-
-void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                      const std::vector<NDArray> &inputs, const OpReqType &req,
-                      const NDArray &out_data) {
-  TmpMemMgr::Get()->Init(ctx.requested[0]);
-  std::vector<mkldnn::primitive::at> in_prims;
-  std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
-  std::vector<float> scales(inputs.size(), 1);
-  in_prims.reserve(inputs.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    auto in_mem = inputs[i].GetMKLDNNData();
-    in_prims.push_back(*in_mem);
-    in_pds[i] = in_mem->get_primitive_desc();
-  }
-  mkldnn::sum::primitive_desc pdesc(scales, in_pds);
-
-  auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second));
-  CommitOutput(out_data, out_mem);
-  stream->Submit();
-}
-
-}  // namespace op
-}  // namespace mxnet
-#endif
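Both Sum and MKLDNNSumForward above build the sum primitive with every scale set to 1, i.e. a plain elementwise addition of identically laid out inputs. A reference sketch of that reduction; ScaledElementwiseSum is a hypothetical helper, not the MKLDNN primitive:

#include <cassert>
#include <cstddef>
#include <vector>

// out[i] = sum over j of scales[j] * inputs[j][i]; the MKLDNN path above uses scales of 1.
std::vector<float> ScaledElementwiseSum(const std::vector<std::vector<float>> &inputs,
                                        const std::vector<float> &scales) {
  assert(!inputs.empty() && inputs.size() == scales.size());
  std::vector<float> out(inputs[0].size(), 0.f);
  for (std::size_t j = 0; j < inputs.size(); ++j) {
    assert(inputs[j].size() == out.size());
    for (std::size_t i = 0; i < out.size(); ++i) out[i] += scales[j] * inputs[j][i];
  }
  return out;
}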
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index 7a20f026f7..a32aaa2152 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
 
 #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_
@@ -78,138 +78,257 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     DMLC_DECLARE_FIELD(pad).set_default(TShape())
     .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding.");
   }
+};
 
-  bool operator==(const PoolingParam& other) const {
-    return this->kernel             == other.kernel &&
-           this->stride             == other.stride &&
-           this->pad                == other.pad &&
-           this->pool_type          == other.pool_type &&
-           this->pooling_convention == other.pooling_convention &&
-           this->global_pool        == other.global_pool &&
-           this->cudnn_off          == other.cudnn_off;
+template<typename xpu, typename DType>
+class PoolingOp : public Operator {
+ public:
+  explicit PoolingOp(PoolingParam p) {
+    this->param_ = p;
   }
-};
 
-}  // namespace op
-}  // namespace mxnet
+  virtual void Forward(const OpContext& ctx,
+                       const std::vector<TBlob>& in_data,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& out_data,
+                       const std::vector<TBlob>& aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& ishape = in_data[pool_enum::kData].shape_;
 
-namespace std {
-template<>
-struct hash<mxnet::op::PoolingParam> {
-  size_t operator()(const mxnet::op::PoolingParam& val) {
-    size_t ret = 0;
-    ret = dmlc::HashCombine(ret, val.kernel);
-    ret = dmlc::HashCombine(ret, val.stride);
-    ret = dmlc::HashCombine(ret, val.pad);
-    ret = dmlc::HashCombine(ret, val.pool_type);
-    ret = dmlc::HashCombine(ret, val.pooling_convention);
-    ret = dmlc::HashCombine(ret, val.global_pool);
-    ret = dmlc::HashCombine(ret, val.cudnn_off);
-    return ret;
+    pool(s, in_data[pool_enum::kData].dptr<DType>(),
+         in_data[pool_enum::kData].shape_,
+         out_data[pool_enum::kOut].shape_,
+         param_.global_pool?
+           TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
+           : param_.kernel,
+         param_.pad,
+         param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+         param_.pool_type,
+         req[pool_enum::kOut],
+         out_data[pool_enum::kOut].dptr<DType>());
   }
-};
-}  // namespace std
 
-namespace mxnet {
-namespace op {
-
-/*
- * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which
- * also changes the number of inputs for backward.
- */
-int GetNumOutputs(const PoolingParam &param);
-int GetNumBackInputs(const PoolingParam &param);
+  virtual void Backward(const OpContext& ctx,
+                        const std::vector<TBlob>& out_grad,
+                        const std::vector<TBlob>& in_data,
+                        const std::vector<TBlob>& out_data,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& in_grad,
+                        const std::vector<TBlob>& aux_args) {
+    using namespace mshadow;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(in_grad.size(), 1U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& ishape = in_data[pool_enum::kData].shape_;
 
-template<typename xpu, typename DType>
-void PoolingForward(const OpContext& ctx, const PoolingParam &param,
-                    const TBlob& in_data, const OpReqType& req,
-                    const TBlob& out_data) {
-  using namespace mshadow;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TShape& ishape = in_data.shape_;
-
-  pool(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
-       param.global_pool?
-       TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
-       : param.kernel,
-       param.pad,
-       param.global_pool? TShape(param.kernel.ndim()) : param.stride,
-       param.pool_type, req, out_data.dptr<DType>());
-}
+    unpool(s, out_grad[pool_enum::kOut].dptr<DType>(),
+           in_data[pool_enum::kData].dptr<DType>(),
+           out_data[pool_enum::kOut].dptr<DType>(),
+           in_grad[pool_enum::kData].shape_,
+           out_grad[pool_enum::kOut].shape_,
+           param_.global_pool?
+             TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
+             : param_.kernel,
+           param_.pad,
+           param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+           param_.pool_type,
+           req[pool_enum::kData],
+           in_grad[pool_enum::kData].dptr<DType>());
+  }
 
-template<typename xpu, typename DType>
-void PoolingBackward(const OpContext& ctx, const PoolingParam &param,
-                     const TBlob& out_grad, const TBlob& in_data,
-                     const TBlob& out_data, const OpReqType& req,
-                     const TBlob& in_grad) {
-  using namespace mshadow;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TShape& ishape = in_data.shape_;
-
-  unpool(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
-         in_grad.shape_, out_grad.shape_,
-         param.global_pool?
-         TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
-         : param.kernel,
-         param.pad,
-         param.global_pool? TShape(param.kernel.ndim()) : param.stride,
-         param.pool_type, req, in_grad.dptr<DType>());
-}
+ private:
+  PoolingParam param_;
+};  // class PoolingOp
 
 template<typename xpu>
-void PoolingCompute(const nnvm::NodeAttrs& attrs,
-                    const OpContext& ctx,
-                    const std::vector<TBlob>& inputs,
-                    const std::vector<OpReqType>& req,
-                    const std::vector<TBlob>& outputs) {
-  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), GetNumOutputs(param));
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      PoolingForward<xpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
+Operator* CreateOp(PoolingParam param, int dtype);
+
+
+#if DMLC_USE_CXX11
+class PoolingProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 1) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    } else if (param_.kernel.ndim() == 2) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
     } else {
-      LOG(FATAL) << "unknown pooling type";
+      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
+      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
     }
-  });
-}
+    CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
+      << "stride and kernel should have the same length";
+    CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
+      << "pad and kernel should have the same length";
+  }
 
-template<typename xpu>
-void PoolingGradCompute(const nnvm::NodeAttrs& attrs,
-                        const OpContext& ctx,
-                        const std::vector<TBlob>& inputs,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& outputs) {
-  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  off_t ograd_idx, in_data_idx, out_data_idx;
-  // When MKLDNN is enabled, the input data may contains arrays for workspace.
-  if (GetNumBackInputs(param) == 5) {
-    ograd_idx = 0;
-    in_data_idx = 2;
-    out_data_idx = 3;
-  } else {
-    ograd_idx = 0;
-    in_data_idx = 1;
-    out_data_idx = 2;
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
   }
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      PoolingBackward<xpu, DType>(ctx, param, inputs[ograd_idx],
-                                  inputs[in_data_idx], inputs[out_data_idx],
-                                  req[0], outputs[0]);
-    } else {
-      LOG(FATAL) << "unknown pooling type";
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    CHECK_EQ(in_shape->size(), 1U);
+    const TShape &dshape = (*in_shape)[0];
+    CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be  3D in (batch, channel, x)"
+                                << " Or 4D in (batch, channel, y, x) "
+                                << " Or 5D in (batch, channel, d, y, x)";
+    TShape oshape = dshape;
+    if (dshape.ndim() ==  0) return false;
+    if (param_.kernel.ndim() == 1) {
+      CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+      } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
+            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
+        if (param_.pooling_convention == pool_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+        }
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);  // save output shape
+    } else if (param_.kernel.ndim() == 2) {
+      CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+      } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
+            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
+        CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+            << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
+            << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
+        if (param_.pooling_convention == pool_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                              param_.stride[1];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[3] + 2 * param_.pad[1] -
+                              param_.kernel[1]) / param_.stride[1]));
+        }
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);  // save output shape
+    } else if (param_.kernel.ndim() == 3) {
+      CHECK_EQ(dshape.ndim(), 5U)
+        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
+      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
+      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+        oshape[4] = 1;
+      } else {
+        if (param_.pooling_convention == pool_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                              param_.stride[1];
+          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
+                              param_.stride[2];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[3] + 2 * param_.pad[1] -
+                              param_.kernel[1]) / param_.stride[1]));
+          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[4] + 2 * param_.pad[2] -
+                              param_.kernel[2]) / param_.stride[2]));
+        }
+      }
+
+      out_shape->clear();
+      out_shape->push_back(oshape);  // save output shape
+    }
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+
+    if (dtype == -1) {
+      LOG(FATAL) << "Input type to pooling is not specified.";
+      return false;
     }
-  });
-}
 
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    PoolingProp *prop_sym = new PoolingProp();
+    prop_sym->param_ = this->param_;
+    return prop_sym;
+  }
+
+  std::string TypeString() const override {
+    return "Pooling";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData],
+            out_data[pool_enum::kOut]};
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+#if MXNET_USE_CUDNN == 1
+    return {};
+#else
+    return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}};
+#endif
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  PoolingParam param_;
+};  // class PoolingProp
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 
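The restored InferShape computes each spatial output as 1 + (d + 2*pad - kernel)/stride under kValid and as 1 + ceil((d + 2*pad - kernel)/stride) otherwise. A short worked sketch of the two conventions; OutDimValid and OutDimFull are illustrative names matching the formulas above:

#include <cmath>
#include <cstdio>

int OutDimValid(int d, int pad, int kernel, int stride) {
  return 1 + (d + 2 * pad - kernel) / stride;                // floor division (kValid)
}

int OutDimFull(int d, int pad, int kernel, int stride) {
  return 1 + static_cast<int>(std::ceil(
      static_cast<float>(d + 2 * pad - kernel) / stride));   // ceil (kFull)
}

int main() {
  // 224x224 input, 3x3 kernel, stride 2, pad 0:
  // kValid gives 1 + (224 - 3) / 2 = 111; kFull gives 1 + ceil(221 / 2) = 112.
  std::printf("valid=%d full=%d\n", OutDimValid(224, 0, 3, 2), OutDimFull(224, 0, 3, 2));
  return 0;
}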
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index f719e0753e..8345ea3886 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -21,300 +21,78 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cc
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
-#include "../elemwise_op_common.h"
 #include "./pooling-inl.h"
+#if MXNET_USE_MKL2017 == 1
+#include <mkl_memory.h>
+#include "../mkl/mkl_memory-inl.h"
+#include "../mkl/mkl_pooling-inl.h"
+#endif  // MXNET_USE_MKL2017
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_pooling-inl.h"
 #endif  // MXNET_USE_NNPACK
-#if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_pooling-inl.h"
-#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
 
-static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
-  using namespace mshadow;
-  PoolingParam param;
-  param.Init(attrs->dict);
-  if (param.kernel.ndim() == 1) {
-    if (param.stride.ndim() == 0) param.stride = Shape1(1);
-    if (param.pad.ndim() == 0) param.pad = Shape1(0);
-  } else if (param.kernel.ndim() == 2) {
-    if (param.stride.ndim() == 0) param.stride = Shape2(1, 1);
-    if (param.pad.ndim() == 0) param.pad = Shape2(0, 0);
-  } else {
-    CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim()
-                                       << "D pooling not supported";
-    if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1);
-    if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0);
-  }
-  CHECK_EQ(param.stride.ndim(), param.kernel.ndim())
-      << "stride and kernel should have the same length";
-  CHECK_EQ(param.pad.ndim(), param.kernel.ndim())
-      << "pad and kernel should have the same length";
-  attrs->parsed = std::move(param);
-}
-
-int GetNumOutputs(const PoolingParam &param) {
-#if MXNET_USE_MKLDNN == 1
-  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1;
-#else
-  return 1;
-#endif
-}
-
-int GetNumBackInputs(const PoolingParam &param) {
-#if MXNET_USE_MKLDNN == 1
-  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3;
-#else
-  return 3;
-#endif
-}
-
-static bool PoolingType(const nnvm::NodeAttrs& attrs,
-                        std::vector<int> *in_attrs,
-                        std::vector<int> *out_attrs) {
-  out_attrs->at(0) = in_attrs->at(0);
-#if MXNET_USE_MKLDNN == 1
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) {
-    CHECK_GT(out_attrs->size(), 1U);
-    out_attrs->at(1) = mshadow::kInt32;
-  }
-#endif
-  return true;
-}
-
-static bool PoolingShape(const nnvm::NodeAttrs &attrs,
-                         std::vector<TShape> *in_shape,
-                         std::vector<TShape> *out_shape) {
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(in_shape->size(), 1U);
-  const TShape &dshape = (*in_shape)[0];
-  CHECK_GE(dshape.ndim(), 3U)
-      << "Pooling: Input data should be  3D in (batch, channel, x)"
-      << " Or 4D in (batch, channel, y, x) "
-      << " Or 5D in (batch, channel, d, y, x)";
-  TShape oshape = dshape;
-  if (dshape.ndim() == 0) return false;
-  if (param.kernel.ndim() == 1) {
-    CHECK_EQ(dshape.ndim(), 3U)
-        << "Pooling: Input data should be 3D in (batch, channel, x)";
-    if (param.global_pool) {
-      oshape[2] = 1;
-    } else {
-      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
-          << "kernel size (" << param.kernel[0] << ") exceeds input ("
-          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
-          << ")";
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
+template<>
+Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_MKL2017 == 1
+    if (param.kernel.ndim() == 2
+      && ((param.pool_type == pool_enum::kMaxPooling)
+      || (param.pool_type == pool_enum::kAvgPooling))) {
+      switch (dtype) {
+      case mshadow::kFloat32:
+        return new MKLPoolingOp<cpu, float>(param);
+      case mshadow::kFloat64:
+        return new MKLPoolingOp<cpu, double>(param);
+      default:
+        break;
       }
     }
-    out_shape->clear();
-    out_shape->push_back(oshape);  // save output shape
-#if MXNET_USE_MKLDNN == 1
-    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
-      out_shape->push_back(oshape);   // for workspace
 #endif
-  } else if (param.kernel.ndim() == 2) {
-    CHECK_EQ(dshape.ndim(), 4U)
-        << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-    if (param.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
-    } else {
-      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
-          << "kernel size (" << param.kernel[0] << ") exceeds input ("
-          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
-          << ")";
-      CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1])
-          << "kernel size (" << param.kernel[1] << ") exceeds input ("
-          << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1])
-          << ")";
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-        oshape[3] = 1 +
-                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
-                        param.stride[1];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
-        oshape[3] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
-                                               param.kernel[1]) /
-                            param.stride[1]));
-      }
+#if MXNET_USE_NNPACK == 1
+  // NNPACK only supports max-pooling with kernel = 2, stride = 2, and pooling_convention
+  // = kFull (note that the default value is kValid in MXNet)
+  if ((param.pool_type == pool_enum::kMaxPooling)
+    && (param.pooling_convention == pool_enum::kFull)
+    && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2)
+    && (param.kernel[0] == 2) && (param.kernel[1] == 2)
+    && (param.stride[0] == 2) && (param.stride[1] == 2)) {
+    switch (dtype) {
+    case mshadow::kFloat32:
+      return new NNPACKPoolingOp<cpu, float>(param);
+    default:
+      break;
     }
-    out_shape->clear();
-    out_shape->push_back(oshape);  // save output shape
-#if MXNET_USE_MKLDNN == 1
-    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
-      out_shape->push_back(oshape);   // for workspace
+  }
 #endif
-  } else if (param.kernel.ndim() == 3) {
-    CHECK_EQ(dshape.ndim(), 5U)
-        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
-    CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0])
-        << "kernel size exceeds input";
-    CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1])
-        << "kernel size exceeds input";
-    CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2])
-        << "kernel size exceeds input";
-    if (param.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
-      oshape[4] = 1;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      op = new PoolingOp<cpu, DType>(param);
     } else {
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-        oshape[3] = 1 +
-                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
-                        param.stride[1];
-        oshape[4] = 1 +
-                    (dshape[4] + 2 * param.pad[2] - param.kernel[2]) /
-                        param.stride[2];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
-        oshape[3] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
-                                               param.kernel[1]) /
-                            param.stride[1]));
-        oshape[4] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[4] + 2 * param.pad[2] -
-                                               param.kernel[2]) /
-                            param.stride[2]));
-      }
+      LOG(FATAL) << "unknown pooling type";
+      return NULL;
     }
+  });
 
-    out_shape->clear();
-    out_shape->push_back(oshape);  // save output shape
-#if MXNET_USE_MKLDNN == 1
-    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
-      out_shape->push_back(oshape);   // for workspace
-#endif
-  }
-  return true;
-}
-
-#if MXNET_USE_MKLDNN == 1
-void PoolingComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
-                         const std::vector<NDArray> &inputs,
-                         const std::vector<OpReqType> &req,
-                         const std::vector<NDArray> &outputs) {
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  const NDArray *workspace = nullptr;
-  if (MKLDNNRequireWorkspace(param)) {
-    CHECK_GT(outputs.size(), 1U);
-    workspace = &outputs[1];
-  }
-  if (SupportMKLDNN(inputs[0])
-      && SupportMKLDNNPooling(param, inputs[0].shape())) {
-    MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs);
-    MKLDNNPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace);
-    MKLDNN_OPCHECK_RUN(PoolingCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(PoolingCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-
-void PoolingGradComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
-                             const std::vector<NDArray> &inputs,
-                             const std::vector<OpReqType> &req,
-                             const std::vector<NDArray> &outputs) {
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  const NDArray &out_grad = inputs[0];
-  const NDArray *workspace = nullptr;
-  const NDArray *in_data = nullptr;
-  if (MKLDNNRequireWorkspace(param)) {
-    // The first two elements are the gradient of the outputs in forward.
-    // The third is the input of forward.
-    // The fourth and the fifth are the outputs of forward.
-    CHECK_EQ(inputs.size(), 5U);
-    in_data = &inputs[2];
-    workspace = &inputs[4];
-  } else {
-    CHECK_EQ(inputs.size(), 3U);
-    in_data = &inputs[1];
-  }
-  const NDArray &in_grad = outputs[0];
-  if (SupportMKLDNN(inputs[0])
-      && SupportMKLDNNPooling(param, inputs[0].shape())) {
-    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNPoolingGradCompute(ctx, param, out_grad, *in_data, workspace,
-                             req[0], in_grad);
-    MKLDNN_OPCHECK_RUN(PoolingGradCompute<cpu>, attrs, ctx, inputs, req,
-                       outputs);
-    return;
-  }
-  FallBackCompute(PoolingGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-#endif
-
-inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs,
-                                      const int dev_mask,
-                                      DispatchMode *dispatch_mode,
-                                      std::vector<int> *in_attrs,
-                                      std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-
-#if MXNET_USE_MKLDNN == 1
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                               dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#else
-  CHECK_EQ(out_attrs->size(), 1);
-#endif
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, DispatchMode::kFCompute);
+  return op;
 }
 
-inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
-                                              const int dev_mask,
-                                              DispatchMode *dispatch_mode,
-                                              std::vector<int> *in_attrs,
-                                              std::vector<int> *out_attrs) {
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(in_attrs->size(), GetNumBackInputs(param));
-  CHECK_EQ(out_attrs->size(), 1);
-
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
-    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                               dispatch_mode, DispatchMode::kFComputeEx);
-  }
-#else
-  CHECK_EQ(in_attrs->size(), 3);
-#endif
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, DispatchMode::kFCompute);
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
 }
 
 DMLC_REGISTER_PARAMETER(PoolingParam);
 
-NNVM_REGISTER_OP(Pooling)
-    .describe(R"code(Performs pooling on the input.
+MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp)
+.describe(R"code(Performs pooling on the input.
 
 The shapes for 1-D pooling are
 
@@ -353,61 +131,8 @@ For 3-D pooling, an additional *depth* dimension is added before
 height, width)*.
 
 )code" ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
-  return GetNumOutputs(param);
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
-                                    [](const NodeAttrs& attrs) { return 1; })
-#endif
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"data"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
-    [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"output"};
-})
-.set_attr_parser(PoolingParamParser)
-.set_attr<FInferStorageType>("FInferStorageType", PoolingStorageType)
-.set_attr<nnvm::FInferType>("FInferType", PoolingType)
-.set_attr<nnvm::FInferShape>("FInferShape", PoolingShape)
-.set_attr<FCompute>("FCompute<cpu>", PoolingCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingComputeExCPU)
-#endif
-.set_attr<nnvm::FGradient>("FGradient",
-                           ElemwiseGradUseInOut{"_backward_Pooling"})
-.add_argument("data", "NDArray-or-Symbol",
-              "Input data to the pooling operator.")
+.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.")
 .add_arguments(PoolingParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_Pooling)
-.set_num_outputs(1)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<nnvm::FInplaceOption>(
-    "FInplaceOption",
-    [](const NodeAttrs &attrs) {
-#if MXNET_USE_CUDNN == 1
-  return std::vector<std::pair<int, int> >();
-#else
-  return std::vector<std::pair<int, int> >{{1, 0}};
-#endif
-})
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
-.set_attr<FInferStorageType>("FInferStorageType",
-                             BackwardPoolingStorageType)
-.set_attr_parser(PoolingParamParser)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingGradComputeExCPU)
-#endif
-.set_attr<FCompute>("FCompute<cpu>", PoolingGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
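
For reference, the shape-inference code removed above distinguishes the two pooling conventions per spatial axis: kValid uses floor division, while kFull rounds up with a ceiling so partially covered windows still produce an output. A minimal standalone sketch of that arithmetic (helper names are illustrative, not MXNet symbols):

    #include <cmath>
    #include <cstdio>

    // kValid: floor division, matching the integer expression above
    static int pooled_size_valid(int dim, int kernel, int pad, int stride) {
      return 1 + (dim + 2 * pad - kernel) / stride;
    }

    // kFull: round up, so a partially covered window still yields an output
    static int pooled_size_full(int dim, int kernel, int pad, int stride) {
      return 1 + static_cast<int>(
          std::ceil(static_cast<float>(dim + 2 * pad - kernel) / stride));
    }

    int main() {
      // A 6-pixel axis with a 3-wide kernel, stride 2, no padding:
      std::printf("valid=%d full=%d\n",
                  pooled_size_valid(6, 3, 0, 2),   // 2
                  pooled_size_full(6, 3, 0, 2));   // 3
      return 0;
    }
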
diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu
index c3bcecfc77..dcebe67982 100644
--- a/src/operator/nn/pooling.cu
+++ b/src/operator/nn/pooling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cu
  * \brief
- * \author Bing Xu, Jun Wu, Da Zheng
+ * \author Bing Xu, Jun Wu
 */
 #include <vector>
 #include "./pooling-inl.h"
@@ -32,112 +32,38 @@
 namespace mxnet {
 namespace op {
 
-#if MXNET_USE_CUDNN == 1
-template<typename DType>
-static CuDNNPoolingOp<DType> &GetCuDNNPoolingOp(const PoolingParam &param) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNPoolingOp<DType> op;
-#else
-  static MX_THREAD_LOCAL CuDNNPoolingOp<DType> op;
-#endif
-  op.Init(param);
-  return op;
-}
-#endif
-
 template<>
-void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                         const OpContext& ctx,
-                         const std::vector<TBlob>& inputs,
-                         const std::vector<OpReqType>& req,
-                         const std::vector<TBlob>& outputs) {
-  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), GetNumOutputs(param));
-
+Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
+  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1
   if (!param.cudnn_off && param.kernel.ndim() > 1) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
       switch (param.pool_type) {
         case pool_enum::kMaxPooling:
-        case pool_enum::kAvgPooling:
-          GetCuDNNPoolingOp<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
-          return;
-        case pool_enum::kSumPooling:
-          LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
+          op = new CuDNNPoolingOp<DType>(param);
           break;
-      }
-    });
-  }
-#endif  // MXNET_USE_CUDNN
-
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      PoolingForward<gpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
-    } else {
-      LOG(FATAL) << "unknown pooling type";
-    }
-  });
-}
-
-template<>
-void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                             const OpContext& ctx,
-                             const std::vector<TBlob>& inputs,
-                             const std::vector<OpReqType>& req,
-                             const std::vector<TBlob>& outputs) {
-  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
-  CHECK_EQ(outputs.size(), 1U);
-  CHECK_EQ(req.size(), 1U);
-  off_t ograd_idx, in_data_idx, out_data_idx;
-  // When MKLDNN is enabled, the input data may contains arrays for workspace.
-  if (GetNumBackInputs(param) == 5) {
-    ograd_idx = 0;
-    in_data_idx = 2;
-    out_data_idx = 3;
-  } else {
-    ograd_idx = 0;
-    in_data_idx = 1;
-    out_data_idx = 2;
-  }
-
-#if MXNET_USE_CUDNN == 1
-  if (!param.cudnn_off && param.kernel.ndim() > 1) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      switch (param.pool_type) {
-        case pool_enum::kMaxPooling:
         case pool_enum::kAvgPooling:
-          GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
-              inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
-          return;
+          op = new CuDNNPoolingOp<DType>(param);
+          break;
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
       }
     });
   }
+  if (op) return op;
 #endif  // MXNET_USE_CUDNN
-
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
         || pool_enum::kSumPooling == param.pool_type) {
-      PoolingBackward<gpu, DType>(ctx, param, inputs[ograd_idx],
-          inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
+      op = new PoolingOp<gpu, DType>(param);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
   });
+  return op;
 }
 
-NNVM_REGISTER_OP(Pooling)
-.set_attr<FCompute>("FCompute<gpu>", PoolingCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_Pooling)
-.set_attr<FCompute>("FCompute<gpu>", PoolingGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
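
The restored CreateOp<gpu> above first tries cuDNN and only falls back to the generic PoolingOp when cuDNN cannot be used. A condensed sketch of that selection logic, with illustrative enum and helper names rather than the MXNet definitions:

    #include <cstdio>

    enum PoolType { kMaxPooling, kAvgPooling, kSumPooling };

    // cuDNN handles 2-D/3-D max and average pooling; 1-D kernels and sum
    // pooling fall back to the generic PoolingOp path (sum also logs a warning).
    static bool UseCudnnPooling(bool cudnn_off, int kernel_ndim, PoolType type) {
      return !cudnn_off && kernel_ndim > 1 &&
             (type == kMaxPooling || type == kAvgPooling);
    }

    int main() {
      std::printf("%d %d %d\n",
                  UseCudnnPooling(false, 2, kMaxPooling),   // 1: cuDNN
                  UseCudnnPooling(false, 2, kSumPooling),   // 0: fallback
                  UseCudnnPooling(false, 1, kAvgPooling));  // 0: fallback
      return 0;
    }
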
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 0f559475d1..4686fb8c0d 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -25,54 +25,11 @@
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
 #include "../tensor/elemwise_binary_op.h"
-#include "mkldnn/mkldnn_base-inl.h"
-#include "mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(SoftmaxParam);
 
-#if MXNET_USE_MKLDNN == 1
-static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
-                                const OpContext& ctx,
-                                const std::vector<NDArray>& inputs,
-                                const std::vector<OpReqType>& req,
-                                const std::vector<NDArray>& outputs) {
-  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-  // It seems MKLDNN softmax doesn't support training.
-  // and it only supports non-negative axis.
-  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]);
-    auto fn = SoftmaxCompute<cpu, mxnet_op::softmax_fwd>;
-    MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(SoftmaxCompute<cpu, mxnet_op::softmax_fwd>, attrs, ctx,
-                  inputs, req, outputs);
-}
-#endif
-
-inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
-                                      const int dev_mask,
-                                      DispatchMode* dispatch_mode,
-                                      std::vector<int> *in_attrs,
-                                      std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-  CHECK_EQ(out_attrs->size(), 1);
-
-  DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
-  // We only run MKLDNN op if it runs on CPU.
-  if (dev_mask == mshadow::cpu::kDevMask)
-    wanted_mode = DispatchMode::kFComputeEx;
-  else
-#endif
-    wanted_mode = DispatchMode::kFCompute;
-  return storage_type_assign(out_attrs, static_cast<NDArrayStorageType>((*in_attrs)[0]),
-                             dispatch_mode, wanted_mode);
-}
-
 MXNET_OPERATOR_REGISTER_UNARY(softmax)
 .describe(R"code(Applies the softmax function.
 
@@ -97,10 +54,6 @@ Example::
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxComputeExCPU)
-#endif
-.set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmax"})
 .add_arguments(SoftmaxParam::__FIELDS__());
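
As a reference for what the registered SoftmaxCompute<cpu, mxnet_op::softmax_fwd> path computes along the chosen axis, a minimal numerically stable softmax over a single row (standalone sketch, not the MXNet kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Subtract the row maximum before exponentiating to avoid overflow,
    // then normalize so the outputs sum to one.
    static void softmax_row(std::vector<float>* row) {
      const float mx = *std::max_element(row->begin(), row->end());
      float sum = 0.f;
      for (float& v : *row) { v = std::exp(v - mx); sum += v; }
      for (float& v : *row) v /= sum;
    }

    int main() {
      std::vector<float> x = {1.f, 2.f, 0.1f};
      softmax_row(&x);
      for (float v : x) std::printf("%.3f ", v);  // ~0.242 0.659 0.099
      std::printf("\n");
      return 0;
    }
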
 
diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h
index b1d542e406..500bf51ccd 100644
--- a/src/operator/nn/softmax_activation-inl.h
+++ b/src/operator/nn/softmax_activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation-inl.h
  * \brief SoftmaxActivation operator
- * \author Junyuan Xie, Da Zheng
+ * \author Junyuan Xie
 */
 #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
 #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
@@ -61,74 +61,153 @@ struct SoftmaxActivationParam : public dmlc::Parameter<SoftmaxActivationParam> {
   }
 };
 
+/**
+ * \brief This is the implementation of softmax_activation operator.
+ * \tparam xpu The device that the op will be executed on.
+ */
 template<typename xpu>
-void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs,
-                              const OpContext& ctx,
-                              const std::vector<TBlob>& inputs,
-                              const std::vector<OpReqType>& reqs,
-                              const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  const TBlob &in_data = inputs[softmax_activation::kData];
-  const OpReqType &req = reqs[softmax_activation::kOut];
-  const TBlob &out_data = outputs[softmax_activation::kOut];
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  if (param.mode == softmax_activation::kInstance) {
-    Tensor<xpu, 2> data = in_data.FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> out = out_data.FlatTo2D<xpu, real_t>(s);
-    Softmax(out, data);
-  } else {
-    CHECK_GE(in_data.ndim(), 3)
+class SoftmaxActivationOp : public Operator {
+ public:
+  explicit SoftmaxActivationOp(SoftmaxActivationParam p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 1U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.mode == softmax_activation::kInstance) {
+      Tensor<xpu, 2> data = in_data[softmax_activation::kData].FlatTo2D<xpu, real_t>(s);
+      Tensor<xpu, 2> out = out_data[softmax_activation::kOut].FlatTo2D<xpu, real_t>(s);
+      Softmax(out, data);
+    } else {
+      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
-    int n = in_data.size(0);
-    int k = in_data.size(1);
-    Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data.Size()/n/k));
-    Tensor<xpu, 3, real_t> data = in_data.get_with_shape<xpu, 3, real_t>(s3, s);
-    Tensor<xpu, 3, real_t> out = out_data.get_with_shape<xpu, 3, real_t>(s3, s);
-    Softmax(out, data);
+      int n = in_data[softmax_activation::kData].size(0);
+      int k = in_data[softmax_activation::kData].size(1);
+      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data[softmax_activation::kData].Size()/n/k));
+      Tensor<xpu, 3, real_t> data =
+        in_data[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(s3, s);
+      Tensor<xpu, 3, real_t> out =
+        out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(s3, s);
+      Softmax(out, data);
+    }
   }
-}
 
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK(in_data.size() == 1 && in_grad.size() == 1);
+    CHECK_EQ(req.size(), 1U);
+    // Use a 3d tensor for both modes -> {instance, channel}. Get shapes
+    int total_size = in_grad[softmax_activation::kData].Size();
+    int batch_size = in_grad[softmax_activation::kData].shape_[0];
+    int channel_num = in_grad[softmax_activation::kData].shape_[1];
+    int rest_size = total_size / (batch_size * channel_num);
+    const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
+    // Get tensors
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 3> m_out_grad =
+      out_grad[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
+    Tensor<xpu, 3> m_out_data =
+      out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
+    Tensor<xpu, 3> m_in_grad =
+      in_grad[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(data_shape, s);
+    // get requested temp space
+    Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
+        Shape2(batch_size, rest_size), s);
+    workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
+    Assign(m_in_grad, req[softmax_activation::kData],
+        m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
+  }
+
+ private:
+  SoftmaxActivationParam param_;
+};  // class SoftmaxActivationOp
+
+// Declare factory function, used for dispatch specialization
 template<typename xpu>
-void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs,
-                                  const OpContext& ctx,
-                                  const std::vector<TBlob>& inputs,
-                                  const std::vector<OpReqType>& reqs,
-                                  const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  CHECK_EQ(inputs.size(), 2U);
-  CHECK_EQ(outputs.size(), 1);
-  CHECK_EQ(reqs.size(), 1);
-  const TBlob &out_grad = inputs[0];
-  const TBlob &out_data = inputs[1];
-  const OpReqType &req = reqs[0];
-  const TBlob &in_grad = outputs[0];
-  // Use 3d tensor for both mode -> {instance, channel}. Get shapes
-  int total_size = in_grad.Size();
-  int batch_size = in_grad.shape_[0];
-  int channel_num = in_grad.shape_[1];
-  int rest_size = total_size / (batch_size * channel_num);
-  const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
-  // Get tensors
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 3> m_out_grad =
-      out_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
-  Tensor<xpu, 3> m_out_data =
-      out_data.get_with_shape<xpu, 3, real_t>(data_shape, s);
-  Tensor<xpu, 3> m_in_grad =
-      in_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
-  // get requested temp space
-  Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
-      Shape2(batch_size, rest_size), s);
-  workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
-  Assign(m_in_grad, req,
-         m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
-}
+Operator* CreateOp(SoftmaxActivationParam type);
+
+#if DMLC_USE_CXX11
+class SoftmaxActivationProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
+    const TShape &dshape = in_shape->at(softmax_activation::kData);
+    if (dshape.ndim() == 0) return false;
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SoftmaxActivationProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "SoftmaxActivation";
+  }
+
+  // declare dependency and inplace optimization options
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
 
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}};
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}};
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  SoftmaxActivationParam param_;
+};
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
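
The Backward above applies the softmax Jacobian per (batch, spatial) position: in_grad = out * (out_grad - sum_j out_grad_j * out_j), with the inner sum taken over the channel axis. A scalar sketch of the same formula (ad-hoc helper, not the mshadow expression):

    #include <cstdio>
    #include <vector>

    // in_grad[i] = out[i] * (out_grad[i] - dot(out_grad, out)), the softmax
    // gradient that the mshadow expression above evaluates over the channel axis.
    static std::vector<float> softmax_backward(const std::vector<float>& out,
                                               const std::vector<float>& out_grad) {
      float dot = 0.f;
      for (size_t i = 0; i < out.size(); ++i) dot += out_grad[i] * out[i];
      std::vector<float> in_grad(out.size());
      for (size_t i = 0; i < out.size(); ++i)
        in_grad[i] = out[i] * (out_grad[i] - dot);
      return in_grad;
    }

    int main() {
      std::vector<float> g = softmax_backward({0.3f, 0.7f}, {1.f, 0.f});
      std::printf("%.3f %.3f\n", g[0], g[1]);  // 0.210 -0.210
      return 0;
    }
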
diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc
index bdfd8b065d..657b382c6e 100644
--- a/src/operator/nn/softmax_activation.cc
+++ b/src/operator/nn/softmax_activation.cc
@@ -21,18 +21,26 @@
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief softmax_activation op
- * \author Junyuan Xie, Da Zheng
+ * \author Junyuan Xie
 */
 #include "./softmax_activation-inl.h"
-#include "../tensor/elemwise_unary_op.h"
 #include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
+template<>
+Operator *CreateOp<cpu>(SoftmaxActivationParam param) {
+  return new SoftmaxActivationOp<cpu>(param);
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
 
 DMLC_REGISTER_PARAMETER(SoftmaxActivationParam);
 
-MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation)
+MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp)
 .describe(R"code(Applies softmax activation to input. This is intended for internal layers.
 
 .. note::
@@ -57,22 +65,8 @@ Example::
    [  6.56221947e-03   5.95310994e-04   9.73919690e-01   1.78379621e-02   1.08472735e-03]]
 
 )code" ADD_FILELINE)
-.set_attr_parser(ParamParser<SoftmaxActivationParam>)
-.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationCompute<cpu>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"})
+.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
 .add_arguments(SoftmaxActivationParam::__FIELDS__());
 
-NNVM_REGISTER_OP(_backward_SoftmaxActivation)
-.set_num_outputs(1)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr_parser(ParamParser<SoftmaxActivationParam>)
-.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
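
CreateOperator above delegates to DO_BIND_DISPATCH, which roughly amounts to choosing the CPU or GPU factory from the context's device mask. A simplified, hypothetical sketch of that kind of dispatch (type and function names here are ad hoc, not the macro's actual expansion):

    #include <cstdio>

    enum DevMask { kCPU, kGPU };
    struct Op { virtual ~Op() {} };
    struct CpuSoftmaxAct : Op {};
    struct GpuSoftmaxAct : Op {};

    // Pick the factory matching the device the operator will run on.
    static Op* CreateSoftmaxAct(DevMask dev) {
      if (dev == kCPU) return new CpuSoftmaxAct();
      return new GpuSoftmaxAct();
    }

    int main() {
      Op* op = CreateSoftmaxAct(kCPU);
      std::printf("created op at %p\n", static_cast<void*>(op));
      delete op;
      return 0;
    }
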
diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu
index f3997e0005..0810483e12 100644
--- a/src/operator/nn/softmax_activation.cu
+++ b/src/operator/nn/softmax_activation.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation.cu
  * \brief
- * \author Junyuan Xie, Da Zheng
+ * \author Junyuan Xie
 */
 #include "./softmax_activation-inl.h"
 #include "../mshadow_op.h"
@@ -31,51 +31,14 @@
 
 namespace mxnet {
 namespace op {
-
+template<>
+Operator *CreateOp<gpu>(SoftmaxActivationParam param) {
 #if MXNET_USE_CUDNN == 1
-
-static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNSoftmaxActivationOp op;
+  return new CuDNNSoftmaxActivationOp(param);
 #else
-  static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op;
-#endif
-  op.Init(param);
-  return op;
-}
-
-template<>
-void SoftmaxActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                                   const OpContext& ctx,
-                                   const std::vector<TBlob>& inputs,
-                                   const std::vector<OpReqType>& req,
-                                   const std::vector<TBlob>& outputs) {
-  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+  return new SoftmaxActivationOp<gpu>(param);
+#endif  // MXNET_USE_CUDNN
 }
-
-template<>
-void SoftmaxActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-                                       const OpContext& ctx,
-                                       const std::vector<TBlob>& inputs,
-                                       const std::vector<OpReqType>& req,
-                                       const std::vector<TBlob>& outputs) {
-  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
-  CHECK_EQ(inputs.size(), 2U);
-  CHECK_EQ(outputs.size(), 1);
-  CHECK_EQ(req.size(), 1);
-  GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
-}
-#endif
-
-NNVM_REGISTER_OP(SoftmaxActivation)
-.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_SoftmaxActivation)
-.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationGradCompute<gpu>);
-
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h
index 4b9159edd1..f660609ace 100644
--- a/src/operator/nn/upsampling-inl.h
+++ b/src/operator/nn/upsampling-inl.h
@@ -35,7 +35,6 @@
 #include <string>
 #include <utility>
 #include "../operator_common.h"
-#include "./deconvolution-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -83,147 +82,253 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
 };  // struct UpSamplingParam
 
 template<typename xpu, typename DType>
-void UpSamplingForward(const OpContext &ctx, const UpSamplingParam &param,
+class UpSamplingNearestOp : public Operator {
+ public:
+  explicit UpSamplingNearestOp(UpSamplingParam p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  CHECK_EQ(in_data.size(), static_cast<size_t>(param.num_args));
-  CHECK_EQ(out_data.size(), 1U);
-  if (req[up_enum::kOut] == kNullOp) {
-    return;
-  }
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
-  if (param.num_args > 1) {
-    int begin = 0;
-    for (int i = 0; i < param.num_args; ++i) {
-      Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
-      int end = begin + data.size(1);
-      int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
-      if (param.multi_input_mode == up_enum::kSum) {
-        if (i == 0) {
-          Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), static_cast<size_t>(param_.num_args));
+    CHECK_EQ(out_data.size(), 1U);
+    if (req[up_enum::kOut] == kNullOp) {
+      return;
+    }
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
+    if (param_.num_args > 1) {
+      int begin = 0;
+      for (int i = 0; i < param_.num_args; ++i) {
+        Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
+        int end = begin + data.size(1);
+        int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
+        if (param_.multi_input_mode == up_enum::kSum) {
+          if (i == 0) {
+            Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
+          } else {
+            out += upsampling_nearest(data, scale);
+          }
         } else {
-          out += upsampling_nearest(data, scale);
+          Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
         }
-      } else {
-        Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+        begin = end;
       }
-      begin = end;
+    } else {
+      Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
+      Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale));
     }
-  } else {
-    Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
-    Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale));
   }
-}
 
-template<typename xpu, typename DType>
-void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam &param,
-                        const TBlob &out_grad, const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  CHECK_EQ(in_grad.size(), static_cast<size_t>(param.num_args));
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  Tensor<xpu, 4, DType> grad = out_grad.get<xpu, 4, DType>(s);
-  if (param.num_args > 1) {
-    int begin = 0;
-    for (int i = 0; i < param.num_args; ++i) {
-      Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(param_.num_args));
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> grad = out_grad[up_enum::kOut].get<xpu, 4, DType>(s);
+    if (param_.num_args > 1) {
+      int begin = 0;
+      for (int i = 0; i < param_.num_args; ++i) {
+        Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
+        mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
+        int end = begin + input_grad.size(1);
+        int scale = grad.size(2)/in_shape[0];
+        if (param_.multi_input_mode == up_enum::kSum) {
+          Assign(input_grad, req[i],
+                 pool<mshadow::red::sum>(grad,
+                                         in_shape,
+                                         scale,
+                                         scale,
+                                         scale,
+                                         scale));
+        } else {
+          Assign(input_grad, req[i],
+                 pool<mshadow::red::sum>(slice<1>(grad, begin, end),
+                                         in_shape,
+                                         scale,
+                                         scale,
+                                         scale,
+                                         scale));
+        }
+        begin = end;
+      }
+    } else {
+      Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
       mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-      int end = begin + input_grad.size(1);
-      int scale = grad.size(2)/in_shape[0];
-      if (param.multi_input_mode == up_enum::kSum) {
-        Assign(input_grad, req[i],
-               pool<mshadow::red::sum>(grad,
-                                       in_shape,
-                                       scale,
-                                       scale,
-                                       scale,
-                                       scale));
+      Assign(input_grad, req[up_enum::kData],
+             pool<mshadow::red::sum>(grad,
+                                     in_shape,
+                                     param_.scale,
+                                     param_.scale,
+                                     param_.scale,
+                                     param_.scale));
+    }
+  }
+
+ private:
+  UpSamplingParam param_;
+};  // class UpSamplingNearestOp
+
+template<typename xpu>
+Operator *CreateOp(UpSamplingParam param, int dtype);
+
+
+#if DMLC_USE_CXX11
+class UpSamplingProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    if (param_.sample_type == up_enum::kNearest) {
+      std::vector<std::string> ret;
+      for (int i = 0; i < param_.num_args; ++i) {
+        ret.push_back(std::string("arg") + std::to_string(i));
+      }
+      return ret;
+    } else {
+      return {"data", "weight"};
+    }
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    CHECK_GE(in_shape->size(), 1U);
+    const TShape &dshape = (*in_shape)[0];
+    TShape oshape = dshape;
+    if (param_.sample_type == up_enum::kNearest) {
+      CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+      oshape[1] = 0;
+      for (auto& shape : *in_shape) {
+        CHECK_EQ(shape.ndim(), 4U) << \
+          "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
+        int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
+        CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
+          "does not divide output height of " << oh;
+        CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
+          "does not divide output width of " << ow;
+        if (param_.multi_input_mode == up_enum::kSum) {
+          CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+            "Number of channels must be the same when multi_input_mode==sum";
+          oshape[1] = shape[1];
+        } else {
+          oshape[1] += shape[1];
+        }
+      }
+    } else {
+      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+      CHECK_EQ(dshape.ndim(), 4U) << \
+        "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
+      if (dshape.ndim() ==  0) return false;
+      int kernel = 2 * param_.scale - param_.scale % 2;
+      SHAPE_ASSIGN_CHECK(*in_shape,
+                         up_enum::kWeight,
+                         mshadow::Shape4(dshape[1], 1, kernel, kernel));
+      oshape = dshape;
+    }
+    oshape[2] = dshape[2] * param_.scale;
+    oshape[3] = dshape[3] * param_.scale;
+    out_shape->clear();
+    out_shape->push_back(oshape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
       } else {
-        Assign(input_grad, req[i],
-               pool<mshadow::red::sum>(slice<1>(grad, begin, end),
-                                       in_shape,
-                                       scale,
-                                       scale,
-                                       scale,
-                                       scale));
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
       }
-      begin = end;
     }
-  } else {
-    Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
-    mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-    Assign(input_grad, req[up_enum::kData],
-           pool<mshadow::red::sum>(grad,
-                                   in_shape,
-                                   param.scale,
-                                   param.scale,
-                                   param.scale,
-                                   param.scale));
-  }
-}
-
-static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) {
-  DeconvolutionParam p = DeconvolutionParam();
-  int kernel = 2 * param.scale - param.scale % 2;
-  int stride = param.scale;
-  int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-  p.workspace = param.workspace;
-  p.num_group = param.num_filter;
-  p.num_filter = param.num_filter;
-  p.no_bias =  true;
-  int shape[] = {1, 1};
-  p.dilate = TShape(shape, shape + 2);
-  shape[0] = shape[1] = kernel;
-  p.kernel = TShape(shape, shape + 2);
-  shape[0] = shape[1] = stride;
-  p.stride = TShape(shape, shape + 2);
-  shape[0] = shape[1] = pad;
-  p.pad = TShape(shape, shape + 2);
-  return p;
-}
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
 
-template<typename xpu>
-void UpSamplingCompute(const nnvm::NodeAttrs& attrs,
-                       const OpContext& ctx, const std::vector<TBlob>& inputs,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& outputs) {
-  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
-  if (param.sample_type == up_enum::kNearest) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
-      UpSamplingForward<xpu, DType>(ctx, param, inputs, req, outputs);
-    });
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = GetDeconvolutionParam(param);
-    _DeconvolutionCompute<xpu>(p, ctx, inputs, req, outputs);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-  }
-}
+  OperatorProperty* Copy() const override {
+    auto ptr = new UpSamplingProp();
+    ptr->param_ = this->param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "UpSampling";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    if (param_.sample_type == up_enum::kNearest) {
+      return {out_grad[up_enum::kOut]};
+    } else {
+      return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]};
+    }
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {};
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    if (param_.sample_type == up_enum::kNearest) {
+      return {};
+    } else {
+      return {ResourceRequest::kTempSpace};
+    }
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    if (param_.sample_type == up_enum::kNearest) {
+      return {};
+    } else {
+      return {ResourceRequest::kTempSpace};
+    }
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
-template<typename xpu>
-void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
-  if (param.sample_type == up_enum::kNearest) {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
-      CHECK_EQ(inputs.size(), 1U);
-      UpSamplingBackward<xpu, DType>(ctx, param, inputs[0], req, outputs);
-    });
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = GetDeconvolutionParam(param);
-    _DeconvolutionGradCompute<xpu>(p, ctx, inputs, req, outputs);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-  }
-}
 
+ private:
+  UpSamplingParam param_;
+};  // class UpSamplingProp
+#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
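
The Forward of UpSamplingNearestOp above replicates each input pixel into a scale-by-scale block (via upsampling_nearest), and its Backward reduces each such block back with a sum pooling. A plain sketch of the forward step for one (batch, channel) plane, using ad-hoc names:

    #include <cstdio>
    #include <vector>

    // Each output pixel (y, x) copies input pixel (y / scale, x / scale).
    static std::vector<float> upsample_nearest(const std::vector<float>& in,
                                               int h, int w, int scale) {
      const int oh = h * scale, ow = w * scale;
      std::vector<float> out(oh * ow);
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
          out[y * ow + x] = in[(y / scale) * w + (x / scale)];
      return out;
    }

    int main() {
      // A 1x2 input upsampled by 2 becomes a 2x4 plane: 1 1 2 2 / 1 1 2 2
      std::vector<float> out = upsample_nearest({1.f, 2.f}, 1, 2, 2);
      for (int y = 0; y < 2; ++y) {
        for (int x = 0; x < 4; ++x) std::printf("%.0f ", out[y * 4 + x]);
        std::printf("\n");
      }
      return 0;
    }
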
 
diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc
index 44b619ac95..8942e35ab3 100644
--- a/src/operator/nn/upsampling.cc
+++ b/src/operator/nn/upsampling.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #include "./upsampling-inl.h"
@@ -30,123 +30,51 @@
 
 namespace mxnet {
 namespace op {
-
-static bool UpSamplingShape(const nnvm::NodeAttrs& attrs,
-                            std::vector<TShape> *in_shape, std::vector<TShape> *out_shape) {
-  const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(attrs.parsed);
-  CHECK_GE(in_shape->size(), 1U);
-  const TShape &dshape = (*in_shape)[0];
-  TShape oshape = dshape;
-  if (param_.sample_type == up_enum::kNearest) {
-    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-    oshape[1] = 0;
-    for (auto& shape : *in_shape) {
-      CHECK_EQ(shape.ndim(), 4U) << \
-        "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
-      int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
-      CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
-        "does not divide output height of " << oh;
-      CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
-        "does not divide output width of " << ow;
-      if (param_.multi_input_mode == up_enum::kSum) {
-        CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
-                         "Number of channels must be the same when multi_input_mode==sum";
-        oshape[1] = shape[1];
-      } else {
-        oshape[1] += shape[1];
-      }
-    }
-  } else {
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    CHECK_EQ(dshape.ndim(), 4U) << \
-      "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
-    if (dshape.ndim() ==  0) return false;
-    int kernel = 2 * param_.scale - param_.scale % 2;
-    SHAPE_ASSIGN_CHECK(*in_shape,
-        up_enum::kWeight,
-        mshadow::Shape4(dshape[1], 1, kernel, kernel));
-    oshape = dshape;
-  }
-  oshape[2] = dshape[2] * param_.scale;
-  oshape[3] = dshape[3] * param_.scale;
-  out_shape->clear();
-  out_shape->push_back(oshape);
-  return true;
-}
-
-static inline std::vector<std::string> ListArguments(const UpSamplingParam& param) {
-  if (param.sample_type == up_enum::kNearest) {
-    std::vector<std::string> ret;
-    for (int i = 0; i < param.num_args; ++i) {
-      ret.push_back(std::string("arg") + std::to_string(i));
-    }
-    return ret;
-  } else {
-    return {"data", "weight"};
-  }
-}
-
-static bool UpSamplingType(const nnvm::NodeAttrs& attrs,
-                           std::vector<int> *in_type, std::vector<int> *out_type) {
-  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
-  CHECK_GE(in_type->size(), 1U);
-  int dtype = (*in_type)[0];
-  CHECK_NE(dtype, -1) << "First input must have specified type";
-  for (index_t i = 0; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype;
+template<>
+Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<cpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias =  true;
+      int shape[] = {1, 1};
+      p.dilate = TShape(shape, shape + 2);
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<cpu, DType>(p);
     } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]);
+      LOG(FATAL) << "Unknown sample type";
     }
-  }
-  out_type->clear();
-  out_type->push_back(dtype);
-  return true;
+  });
+  return op;
 }
 
-struct UpSamplingGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-    const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(n->attrs.parsed);
-    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
-    if (param_.sample_type != up_enum::kNearest) {
-      heads.push_back(n->inputs[up_enum::kData]);
-      heads.push_back(n->inputs[up_enum::kWeight]);
-    }
-    return MakeGradNode(op_name, n, heads, n->attrs.dict);
-  }
-};
+Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                           std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+}
 
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
 
-NNVM_REGISTER_OP(UpSampling)
+MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp)
 .describe("Performs nearest neighbor/bilinear up sampling to inputs.")
-.set_num_inputs([](const NodeAttrs& attrs) {
-  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
-  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
-})
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<UpSamplingParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-    [](const NodeAttrs& attrs) {
-  return ListArguments(nnvm::get<UpSamplingParam>(attrs.parsed));
-})
-.set_attr<nnvm::FInferShape>("FInferShape", UpSamplingShape)
-.set_attr<nnvm::FInferType>("FInferType", UpSamplingType)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
-  if (param.sample_type == up_enum::kNearest) {
-    return std::vector<ResourceRequest>();
-  } else {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  }
-})
-.set_attr<FCompute>("FCompute<cpu>", UpSamplingCompute<cpu>)
-.set_attr<nnvm::FGradient>("FGradient", UpSamplingGrad{"_backward_UpSampling"})
-.set_attr<std::string>("key_var_num_args", "num_args")
 .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample")
 .add_arguments(UpSamplingParam::__FIELDS__())
+.set_key_var_num_args("num_args");
+
+NNVM_REGISTER_OP(UpSampling)
 .set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
     [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
       if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
@@ -154,23 +82,5 @@ NNVM_REGISTER_OP(UpSampling)
         var->attrs.dict["__init__"] = "[\"bilinear\", {}]";
       }
     });
-
-NNVM_REGISTER_OP(_backward_UpSampling)
-.set_num_outputs([](const NodeAttrs& attrs) {
-  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
-  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
-  if (param.sample_type == up_enum::kNearest) {
-    return std::vector<ResourceRequest>();
-  } else {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  }
-})
-.set_attr_parser(ParamParser<UpSamplingParam>)
-.set_attr<FCompute>("FCompute<cpu>", UpSamplingGradCompute<cpu>);
-
 }  // namespace op
 }  // namespace mxnet
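
In the bilinear branch of CreateOp above, UpSampling is lowered onto DeconvolutionOp with hyper-parameters derived from scale: kernel = 2*scale - scale%2, stride = scale, pad = ceil((scale-1)/2). A small standalone check of that arithmetic:

    #include <cmath>
    #include <cstdio>

    int main() {
      for (int scale : {2, 3, 4}) {
        const int kernel = 2 * scale - scale % 2;
        const int stride = scale;
        const int pad = static_cast<int>(std::ceil((scale - 1) / 2.));
        // scale=2 -> kernel 4, stride 2, pad 1; scale=3 -> 5, 3, 1; scale=4 -> 8, 4, 2
        std::printf("scale=%d kernel=%d stride=%d pad=%d\n",
                    scale, kernel, stride, pad);
      }
      return 0;
    }
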
diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu
index c5ff2fafd6..f83535a2b2 100644
--- a/src/operator/nn/upsampling.cu
+++ b/src/operator/nn/upsampling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu, Da Zheng
+ * \author Bing Xu
 */
 
 #include "./deconvolution-inl.h"
@@ -29,12 +29,36 @@
 
 namespace mxnet {
 namespace op {
-
-NNVM_REGISTER_OP(UpSampling)
-.set_attr<FCompute>("FCompute<gpu>", UpSamplingCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_UpSampling)
-.set_attr<FCompute>("FCompute<gpu>", UpSamplingGradCompute<gpu>);
+template<>
+Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<gpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias =  true;
+      int shape[] = {1, 1};
+      p.dilate = TShape(shape, shape + 2);
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<gpu, DType>(p);
+    } else {
+      LOG(FATAL) << "Unknown sample type";
+    }
+  });
+  return op;
+}
 
 }  // namespace op
 }  // namespace mxnet
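
The GPU CreateOp above mirrors the CPU version: nearest sampling goes to UpSamplingNearestOp<gpu, DType>, and bilinear sampling builds the same deconvolution parameters shown in the check after upsampling.cc before instantiating DeconvolutionOp<gpu, DType>.
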
diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h
index e345bb2193..ed20027385 100644
--- a/src/operator/tensor/cast_storage-inl.h
+++ b/src/operator/tensor/cast_storage-inl.h
@@ -27,15 +27,11 @@
 #include <dmlc/timer.h>
 #include <mxnet/ndarray.h>
 #include <vector>
-#include <algorithm>
 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #ifdef __CUDACC__
 #include "./cast_storage-inl.cuh"
 #endif  // __CUDACC__
-#if MXNET_USE_MKLDNN == 1
-#include "../nn/mkldnn/mkldnn_base-inl.h"
-#endif
 
 
 namespace mxnet {
@@ -346,20 +342,8 @@ void CastStorageComputeImpl(const OpContext& ctx,
   } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) {
     TBlob ret = output.data();
     CastStorageCsrDnsImpl<xpu>(ctx, input, &ret);
-#if MXNET_USE_MKLDNN == 1
-  } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) {
-    CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type);
-    // If one of them uses the MKLDNN layout.
-    if (input.IsMKLDNNData() || output.IsMKLDNNData()) {
-      auto in_mem = input.GetMKLDNNData();
-      const_cast<NDArray &>(output).CopyFrom(*in_mem);
-      MKLDNNStream::Get()->Submit();
-    } else {
-      mxnet_op::copy(ctx.get_stream<xpu>(), output.data(), input.data());
-    }
-#endif
   } else {
-    LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype;
+    LOG(FATAL) << "Not implemented";
   }
 }
 
@@ -392,14 +376,8 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs,
   // dns -> dns, dns -> rsp, dns -> csr
   if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) {
     // dns -> dns
-    DispatchMode mode = DispatchMode::kFCompute;
-#if MXNET_USE_MKLDNN == 1
-    // If we use MKLDNN and the arrays are in CPU memory, the array may store
-    // MKLDNN layout, we should convert its layout explicitly.
-    if (dev_mask == kCPU)
-      mode = DispatchMode::kFComputeEx;
-#endif
-    dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode);
+    dispatched = storage_type_assign(out_attrs, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
   }
   if (!dispatched && in_stype == kDefaultStorage &&
     (param_stype == kRowSparseStorage || param_stype == kCSRStorage)) {
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index d73edc7235..d7e5e04ce8 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -24,68 +24,11 @@
  */
 #include "./elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
-#include "../nn/mkldnn/mkldnn_ops-inl.h"
-#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx,
-                          const std::vector<NDArray>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 2U);
-  CHECK_EQ(outputs.size(), 1U);
-#if MXNET_USE_MKLDNN == 1
-  if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) {
-    MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]);
-    return;
-  } else if (inputs[0].storage_type() == kDefaultStorage
-             && inputs[1].storage_type() == kDefaultStorage) {
-    // This happens if inputs are supposed to be in MKLDNN format
-    // but MKLDNN doesn't support the data type or the shape. We're
-    // forced to convert it to the default format.
-    std::vector<TBlob> in_blobs(2);
-    std::vector<TBlob> out_blobs(1);
-    in_blobs[0] = inputs[0].data();
-    in_blobs[1] = inputs[1].data();
-    out_blobs[0] = outputs[0].data();
-    ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>(attrs, ctx, in_blobs,
-                                                         req, out_blobs);
-    return;
-  }
-#endif
-  ElemwiseBinaryOp::ComputeEx<cpu, op::mshadow_op::plus>(attrs, ctx, inputs,
-                                                         req, outputs);
-}
-
-static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs,
-                                          const int dev_mask,
-                                          DispatchMode* dispatch_mode,
-                                          std::vector<int> *in_attrs,
-                                          std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 2);
-  CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode,
-                                                         in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
-
-MXNET_OPERATOR_REGISTER_BINARY(elemwise_add)
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddStorageType)
-.set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", ElemwiseAddEx)
-.set_attr<FResourceRequest>("FResourceRequest",  /* For Sparse CSR */
-                            [](const NodeAttrs& attrs) {
-                            return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};})
+MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus)
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add)
 .add_alias("_add").add_alias("_plus").add_alias("_Plus")
 .describe(R"code(Adds arguments element-wise.
@@ -103,41 +46,6 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs
 // this must differ from elemwise_add to prevent add to optimization in forward pass.
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus);
 
-static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
-                                    const OpContext& ctx,
-                                    const std::vector<NDArray>& inputs,
-                                    const std::vector<OpReqType>& req,
-                                    const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 2U);
-#if MXNET_USE_MKLDNN == 1
-  if (inputs[0].IsMKLDNNData()) {
-    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
-    MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]);
-    return;
-  }
-#endif
-  ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity, mshadow_op::identity>(
-      attrs, ctx, inputs, req, outputs);
-}
-
-static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs,
-                                                  const int dev_mask,
-                                                  DispatchMode* dispatch_mode,
-                                                  std::vector<int> *in_attrs,
-                                                  std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-  CHECK_EQ(out_attrs->size(), 2);
-  bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode,
-                                                         in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
-
 NNVM_REGISTER_OP(_backward_add)
 .set_num_inputs(1)
 .set_num_outputs(2)
@@ -147,15 +55,13 @@ NNVM_REGISTER_OP(_backward_add)
                                   return std::vector<std::pair<int, int> >{{0, 0},
                                                                            {0, 1}};
                                 })
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
 .set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::BackwardUseNone<
   cpu, mshadow_op::identity, mshadow_op::identity>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", _backward_ElemwiseAddEx)
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddBackwardStorageType);
+.set_attr<FComputeEx>("FComputeEx<cpu>",
+                      ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity,
+                      mshadow_op::identity>)
+.set_attr<FInferStorageType>("FInferStorageType",
+                             ElemwiseStorageType<1, 2, true, true, true>);
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus)
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub)
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
index 6118ddf19c..8c12218be0 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
@@ -65,7 +65,7 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a
   const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback
                                        : DispatchMode::kFComputeEx;
   const double alpha = nnvm::get<double>(attrs.parsed);
-  if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
+  if (instype == kDefaultStorage) {
     dispatched = storage_type_assign(&out_attrs[0],
       kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
   }
@@ -89,7 +89,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs,
   const auto in_stype = in_attrs->at(0);
   auto &out_stype = out_attrs->at(0);
   bool dispatched = false;
-  if (!dispatched && (in_stype == kDefaultStorage)) {
+  if (!dispatched && in_stype == kDefaultStorage) {
     // dns -> dns
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 10154bc964..b31dbb2598 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -24,8 +24,6 @@
 */
 #include "./elemwise_sum.h"
 #include "../../ndarray/ndarray_function.h"
-#include "../nn/mkldnn/mkldnn_ops-inl.h"
-#include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
@@ -81,28 +79,9 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                            std::vector<int> *out_attrs) {
   CHECK(!in_attrs->empty());
   CHECK_EQ(out_attrs->size(), 1U);
-  bool ret = ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
-                                                     in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  // We should always use FComputeEx.
-  if (dev_mask == mshadow::cpu::kDevMask
-      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
-
-#if MXNET_USE_MKLDNN == 1
-static inline bool IsMKLDNNData(const std::vector<NDArray> &arrs) {
-  for (auto &arr : arrs) {
-    if (!arr.IsMKLDNNData())
-      return false;
-  }
-  return true;
+  return ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
+                                                 in_attrs, out_attrs);
 }
-#endif
 
 void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const OpContext& ctx,
@@ -113,28 +92,13 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
+  CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
   if (inputs[0].storage_type() == kRowSparseStorage) {
     mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
     Resource rsc = ResourceManager::Get()->Request(ctx.run_ctx.get_ctx(),
         ResourceRequest(ResourceRequest::kTempSpace));
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum<cpu>(s, rsc, inputs, &out_nd);
-#if MXNET_USE_MKLDNN == 1
-  } else if (IsMKLDNNData(inputs)) {
-    MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]);
-#endif
-  } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {
-    // This case happens when we want to create an MKLDNN NDArray but the type
-    // or the shape isn't supported by MKLDNN. In this case, NDArray falls back
-    // to the default storage type and, thus, we have to handle the default
-    // storage in FComputeEx.
-    std::vector<TBlob> in_blobs(inputs.size());
-    std::vector<TBlob> out_blobs(outputs.size());
-    for (size_t i = 0; i < in_blobs.size(); i++)
-      in_blobs[i] = inputs[i].data();
-    for (size_t i = 0; i < out_blobs.size(); i++)
-      out_blobs[i] = outputs[i].data();
-    ElementWiseSumCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
   } else {
     LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
   }
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index cca3b2c9ff..13a58d0165 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -24,7 +24,6 @@
 #include <mxnet/base.h>
 #include "elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
-#include "../nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -108,64 +107,12 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid,
                                                unary_bwd<mshadow_op::sigmoid_grad>);
 
 // copy
-static void CopyEx(const nnvm::NodeAttrs& attrs,
-                   const OpContext& ctx,
-                   const std::vector<NDArray>& inputs,
-                   const std::vector<OpReqType>& req,
-                   const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-  const auto in_stype = inputs[0].storage_type();
-  const auto out_stype = outputs[0].storage_type();
-#if MXNET_USE_MKLDNN == 1
-  if (inputs[0].IsMKLDNNData()) {
-    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
-    return;
-  } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) {
-    // This happens if inputs are supposed to be in MKLDNN format
-    // but MKLDNN doesn't support the data type or the shape. We're
-    // forced to convert it to the default format.
-    std::vector<TBlob> in_blobs {inputs[0].data()};
-    std::vector<TBlob> out_blobs {outputs[0].data()};
-    UnaryOp::IdentityCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
-    return;
-  }
-#endif
-  UnaryOp::IdentityComputeEx<cpu>(attrs, ctx, inputs, req, outputs);
-}
-
-static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs,
-                                   const int dev_mask,
-                                   DispatchMode* dispatch_mode,
-                                   std::vector<int> *in_attrs,
-                                   std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-  CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
-                                                          in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  // We have to make sure all inputs are default layouts. Otherwise, we might
-  // want to fallback.
-  if (dev_mask == mshadow::cpu::kDevMask
-      && in_attrs->at(0) == kDefaultStorage
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
-
 MXNET_OPERATOR_REGISTER_UNARY(_copy)
 .MXNET_DESCRIBE("Returns a copy of the input.")
 .add_alias("identity")
-.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
+.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
+.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
@@ -180,14 +127,9 @@ NNVM_REGISTER_OP(_backward_copy)
   [](const NodeAttrs& attrs){
     return std::vector<std::pair<int, int> >{{0, 0}};
   })
-.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
+.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
+.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 25c233318f..9167fcfe7e 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -25,8 +25,6 @@
 // this will be invoked by gcc and compile CPU version
 #include "./matrix_op-inl.h"
 #include "./elemwise_unary_op.h"
-#include "../nn/mkldnn/mkldnn_ops-inl.h"
-#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -182,51 +180,6 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
-static void FlattenEx(const nnvm::NodeAttrs& attrs,
-                      const OpContext& ctx,
-                      const std::vector<NDArray>& inputs,
-                      const std::vector<OpReqType>& req,
-                      const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
-  CHECK_EQ(outputs.size(), 1U);
-#if MXNET_USE_MKLDNN == 1
-  const auto in_stype = inputs[0].storage_type();
-  const auto out_stype = outputs[0].storage_type();
-  if (inputs[0].IsMKLDNNData()) {
-    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
-    // If the output is a special MKLDNN layout and the number of dimensions
-    // is larger than 2, we should use the default layout.
-    if (outputs[0].IsMKLDNNData() && inputs[0].shape().ndim() > 2)
-      const_cast<NDArray &>(outputs[0]).Reorder2Default();
-    return;
-  } else {
-    // This happens if inputs are supposed to be in MKLDNN format
-    // but MKLDNN doesn't support the data type or the shape. We're
-    // forced to convert it to the default format.
-    FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-#endif
-}
-
-static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
-                                   const int dev_mask,
-                                   DispatchMode* dispatch_mode,
-                                   std::vector<int> *in_attrs,
-                                   std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1);
-  CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode,
-                                                            in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask
-      && in_attrs->at(0) == kDefaultStorage
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
-}
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -257,15 +210,8 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
-.set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-#endif
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 52df4dd2bb..f0dd61f01a 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -54,13 +54,7 @@ class CPUDeviceStorage {
   /*!
    * \brief Alignment of allocation.
    */
-#if MXNET_USE_MKLDNN == 1
-  // MKLDNN requires special alignment. 4096 is used by the MKLDNN library in
-  // memory allocation.
-  static constexpr size_t alignment_ = 4096;
-#else
   static constexpr size_t alignment_ = 16;
-#endif
 };  // class CPUDeviceStorage
 
 inline void* CPUDeviceStorage::Alloc(size_t size) {
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
index 1d6d64be38..794a4c55ee 100755
--- a/tests/ci_build/ci_build.sh
+++ b/tests/ci_build/ci_build.sh
@@ -178,7 +178,6 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GID=$(id -g)" \
     -e "CUDA_ARCH=-gencode arch=compute_52,code=[sm_52,compute_52] --fatbin-options -compress-all" \
     -e "MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0" \
-    -e "ARCH_OPT=-mavx2" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMG_NAME} \
     ${PRE_COMMAND} \
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 570911c235..6a220bdad6 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -209,13 +209,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
           requested.emplace_back(r);
         } else if (req.type == ResourceRequest::kRandom) {
           requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req));
-        } else if (req.type == ResourceRequest::kParallelRandom) {
-          Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req);
-          if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) {
-            common::random::RandGenerator<cpu, DType>::AllocState(
-                rm.get_parallel_random<cpu, DType>());
-          }
-          requested.emplace_back(rm);
         } else {
           LOG(FATAL) << "resource type not yet supported";
         }
@@ -321,9 +314,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
       // Set up forward
       attrs_ = ParseAttrs(op_, args);
 
-      int num_inputs = op_->num_inputs;
-      if (op_->get_num_inputs)
-        num_inputs = op_->get_num_inputs(attrs_);
+      const int num_inputs = op_->num_inputs;
 
       if (!inputs.empty()) {
         CHECK_EQ(inputs.size(), static_cast<size_t>(num_inputs));
@@ -349,8 +340,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
 
       inputs_.reserve(num_inputs);
       inputs_p.reserve(num_inputs);
-      outputs_.reserve(inferred_num_outputs);
-      outputs_p.reserve(inferred_num_outputs);
+      outputs_.reserve(num_visible_outputs);
+      outputs_p.reserve(num_visible_outputs);
 
       for (size_t i = 0; i < static_cast<size_t>(num_inputs); ++i) {
         CHECK_LT(i, static_cast<int>(shapes.size()));
@@ -359,7 +350,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
         inputs_p.emplace_back(&*inputs_.rbegin());
       }
 
-      for (size_t i = 0; i < static_cast<size_t>(inferred_num_outputs); ++i) {
+      for (size_t i = 0; i < static_cast<size_t>(num_visible_outputs); ++i) {
         // If supplied and valid, pass from the supplied outputs vector
         // Otherwise use empty for forward pass, or zero-filled for backward pass
         outputs_.emplace_back(i < outputs.size()
diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h
index 1e00e30a1b..0992c41f76 100644
--- a/tests/cpp/include/test_op_runner.h
+++ b/tests/cpp/include/test_op_runner.h
@@ -137,8 +137,7 @@ class OperatorRunner {
              const test::op::kwargs_t& kwargs,
              int dim = 0,
              size_t count = 1,
-             const std::vector<TShape>& timing_shapes = {},
-             bool backward = true) {
+             const std::vector<TShape>& timing_shapes = {}) {
     if (mxnet::test::quick_test) {
       total_iterations_ = 2;
       count = 1;
@@ -226,7 +225,7 @@ class OperatorRunner {
           CHECK(false) << "Unsupported dimension count: " << (D + 1);
       }
       if (info.executor_) {
-        if (info.executor_->HasBackward() && backward) {
+        if (info.executor_->HasBackward()) {
           RunGenericOperatorBackward(&info, count);
         }
         timing += info.executor_->GetTiming();
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index 1bd8ca89c9..e482848705 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -26,7 +26,7 @@
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
 #include "../include/test_op_runner.h"
-#include "../include/test_core_op.h"
+#include "../include/test_legacy_op.h"
 #include "../../src/operator/nn/activation-inl.h"
 
 using namespace mxnet;
@@ -41,10 +41,8 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) {
   TShape shape({5, 5});
   kwargs_t kwargs = basic_activation_args;
   kwargs.push_back({"act_type", "tanh"});
-
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
-          kwargs, "Activation", "_backward_Activation"), 1);
+  test::op::LegacyOpRunner<mxnet::op::ActivationProp, float, float> runner;
+  runner.RunBidirectional(false, { shape }, kwargs, 1);
 }
 
 /*!
@@ -54,12 +52,10 @@ TEST(ACTIVATION_PERF, TimingCPU) {
   kwargs_t kwargs = basic_activation_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"act_type", "tanh"});
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Activation",
-                                                           "_backward_Activation");
-  TShape shape({10, 10, 10, 10});
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
-
+  test::op::LegacyOpRunner<mxnet::op::ActivationProp, float, float> runner;
+  runner.RunBidirectional(false,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -88,11 +84,11 @@ TEST(ACTIVATION_PERF, TimingGPU) {
   kwargs_t kwargs = basic_activation_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"act_type", "tanh"});
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Activation",
-                                                           "_backward_Activation");
-  TShape shape({10, 10, 10, 10});
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(true, { shape }, kwargs, 1);
+  test::OperatorRunner<mxnet::op::ActivationProp,
+    test::op::LegacyOperatorExecutor<float, float>> runner;
+  runner.RunBidirectional(true,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes = {
       {1,  1, 28,  28},
       {1,  3, 28,  28},
diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc
index 607b980468..179e42a383 100644
--- a/tests/cpp/operator/batchnorm_test.cc
+++ b/tests/cpp/operator/batchnorm_test.cc
@@ -24,14 +24,11 @@
  * \author Chris Olivier
 */
 
-#if 0
-
 #include <dmlc/logging.h>
 #include <mxnet/tensor_blob.h>
 #include "../../src/operator/nn/batch_norm-inl.h"
 #include "../../src/operator/batch_norm_v1-inl.h"
 #include "./test_legacy_op.h"
-#include "./test_core_op.h"
 #include "executor/exec_pass.h"
 
 using namespace mxnet;
@@ -1830,5 +1827,3 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) {
 }
 
 #endif  // MXNET_USE_CUDA
-
-#endif
diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc
index c28b9bd480..90bf6ebb0d 100644
--- a/tests/cpp/operator/dropout_perf.cc
+++ b/tests/cpp/operator/dropout_perf.cc
@@ -26,7 +26,7 @@
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
 #include "../include/test_op_runner.h"
-#include "../include/test_core_op.h"
+#include "../include/test_legacy_op.h"
 #include "../../src/operator/nn/dropout-inl.h"
 
 using namespace mxnet;
@@ -41,10 +41,8 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) {
   TShape shape({5, 5});
   kwargs_t kwargs = basic_dropout_args;
   kwargs.push_back({"mode", "always"});
-  test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
-  runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
+  test::op::LegacyOpRunner<mxnet::op::DropoutProp, float, float> runner;
+  runner.RunBidirectional(false, { shape }, kwargs, 1);
 }
 
 /*!
@@ -54,11 +52,10 @@ TEST(DROPOUT_PERF, TimingCPU) {
   kwargs_t kwargs = basic_dropout_args;
 // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"mode", "always"});
-  TShape shape({10, 10, 10, 10});
-  test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
-  runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
+  test::op::LegacyOpRunner<mxnet::op::DropoutProp, float, float> runner;
+  runner.RunBidirectional(false,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -75,9 +72,7 @@ TEST(DROPOUT_PERF, TimingCPU) {
     };
   }
   for (const TShape &shape : shapes) {
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                             "_backward_Dropout");
-    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }, false);
+    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape });
   }
 }
 
@@ -89,11 +84,11 @@ TEST(DROPOUT_PERF, TimingGPU) {
   kwargs_t kwargs = basic_dropout_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"mode", "always"});
-  TShape shape({10, 10, 10, 10});
-  test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                           "_backward_Dropout");
-  runner.RunGenericOperatorForward(true, { shape }, kwargs, 1);
+  test::OperatorRunner<mxnet::op::DropoutProp,
+    test::op::LegacyOperatorExecutor<float, float>> runner;
+  runner.RunBidirectional(true,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes = {
     {1,  1, 28,  28},
     {1,  3, 28,  28},
@@ -102,9 +97,8 @@ TEST(DROPOUT_PERF, TimingGPU) {
     {20, 3, 128, 128}
   };
   for (const TShape &shape : shapes) {
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
-                                                             "_backward_Dropout");
-    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }, false);
+    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape });
   }
 }
 #endif  // MXNET_USE_CUDA == 1
+
diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc
index 0ea4082a3f..2acfacdddc 100644
--- a/tests/cpp/operator/fully_conn_perf.cc
+++ b/tests/cpp/operator/fully_conn_perf.cc
@@ -29,25 +29,21 @@
 #include <nnvm/tuple.h>
 #include "../../src/operator/nn/fully_connected-inl.h"
 #include "../include/test_op_runner.h"
-#include "../include/test_core_op.h"
+#include "../include/test_legacy_op.h"
 
 using namespace mxnet;
 
 typedef std::vector<std::pair<std::string, std::string> > kwargs_t;
 
-const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"}, {"no_bias", "true"} };
+const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} };
 /*!
  * \brief Generic bidirectional sanity test
  */
 TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) {
-  TShape shape1({5, 5});
-  TShape shape2({250, 5});
+  TShape shape({5, 5});
   kwargs_t kwargs = basic_fullyconn_args;
-  test::op::CoreOperatorRunner<float> runner;
-  runner.set_verbose(true);
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
-                                                           "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
+  test::op::LegacyOpRunner<mxnet::op::FullyConnectedProp, float, float> runner;
+  runner.RunBidirectional(false, { shape }, kwargs, 1);
 }
 
 /*!
@@ -55,12 +51,10 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) {
  */
 TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
   kwargs_t kwargs = basic_fullyconn_args;
-  TShape shape1({10, 10, 10, 10});
-  TShape shape2({250, 1000});
-  test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
-                                                           "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
+  test::op::LegacyOpRunner<mxnet::op::FullyConnectedProp, float, float> runner;
+  runner.RunBidirectional(false,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -77,11 +71,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
     };
   }
   for (const TShape& shape : shapes) {
-    TShape shape2({250, static_cast<nnvm::dim_t>(shape.ProdShape(1, shape.ndim()))});
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
-                                                             "_backward_FullyConnected");
-    runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10,
-                      { shape, shape2 }, false);
+    runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, { shape });
   }
 }
 
@@ -91,12 +81,12 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
  */
 TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) {
   kwargs_t kwargs = basic_fullyconn_args;
-  TShape shape1({10, 10, 10, 10});
-  TShape shape2({250, 1000});
-  test::op::CoreOperatorRunner<float> runner;
-  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
-                                                           "_backward_FullyConnected");
-  runner.RunGenericOperatorForward(true, { shape1, shape2 }, kwargs, 1);
+  test::OperatorRunner<mxnet::op::FullyConnectedProp,
+    test::op::LegacyOperatorExecutor<float, float>>
+    runner;
+  runner.RunBidirectional(true,
+                          { TShape({10, 10, 10, 10}) },
+                          kwargs, 1);  // prime code and cache
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -113,11 +103,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) {
     };
   }
   for (const TShape& shape : shapes) {
-    TShape shape2({250, shape.ProdShape(1, shape.ndim())});
-    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
-                                                             "_backward_FullyConnected");
-    runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10,
-                      { shape, shape2 }, false);
+    runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, { shape });
   }
 }
 #endif  // MXNET_USE_CUDA == 1
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
deleted file mode 100644
index a8a3d26fac..0000000000
--- a/tests/cpp/operator/mkldnn.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- *  \file mkldnn.cc
- *  \brief test functions in mkldnn.
- *  \author Da Zheng
- */
-
-#if MXNET_USE_MKLDNN == 1
-
-#include "gtest/gtest.h"
-#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
-
-bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) {
-  void *ret1, *ret2;
-  size_t space1, space2;
-  space1 = space;
-  space2 = space;
-  ret1 = mxnet::AlignMem(mem, size, alignment, &space1);
-  ret2 = std::align(alignment, size, mem, space2);
-  EXPECT_EQ(ret1, ret2);
-  EXPECT_EQ(space1, space2);
-  return ret1 == ret2;
-}
-
-TEST(MKLDNN_UTIL_FUNC, AlignMem) {
-  size_t alignment = 4096;
-  void *mem;
-  size_t size, space;
-
-  // When mem has been aligned.
-  mem = reinterpret_cast<void *>(0x10000);
-  size = 1000;
-  space = 10000;
-  test_mem_align(mem, size, alignment, space);
-
-  // When mem isn't aligned and we have enough space for alignment.
-  mem = reinterpret_cast<void *>(0x10010);
-  size = 1000;
-  space = 10000;
-  test_mem_align(mem, size, alignment, space);
-
-  // When mem isn't aligned and we don't have enough memory for alignment
-  mem = reinterpret_cast<void *>(0x10010);
-  size = 1000;
-  space = 1001;
-  test_mem_align(mem, size, alignment, space);
-
-  for (size_t i = 0; i < 10000; i++) {
-    mem = reinterpret_cast<void *>(random());
-    size = random() % 2000;
-    space = random() % 2000;
-    test_mem_align(mem, size, alignment, space);
-  }
-}
-#endif
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
deleted file mode 100644
index bc35b0b323..0000000000
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from __future__ import print_function
-import mxnet as mx
-import numpy as np
-import copy
-from mxnet import autograd
-from mxnet.gluon.model_zoo.vision import get_model
-from mxnet.test_utils import assert_almost_equal
-import sys
-
-def eprint(*args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-VAL_DATA='data/val-5k-256.rec'
-def download_data():
-    return mx.test_utils.download(
-        'http://data.mxnet.io/data/val-5k-256.rec', VAL_DATA)
-
-def test_inference():
-    all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3',
-                  'densenet201', 'squeezenet1.0', 'mobilenet0.25']
-
-    batch_size = 10
-    download_data()
-    for model_name in all_models:
-        eprint('testing inference on %s'%model_name)
-
-        data_shape = (3, 224, 224) if 'inception' not in model_name else (3, 299, 299)
-        dataIter = mx.io.ImageRecordIter(
-            path_imgrec        = VAL_DATA,
-            label_width        = 1,
-            preprocess_threads = 1,
-            batch_size         = batch_size,
-            data_shape         = data_shape,
-            label_name         = 'softmax_label',
-            rand_crop          = False,
-            rand_mirror        = False)
-        data_batch = dataIter.next()
-        data = data_batch.data[0]
-        label = data_batch.label[0]
-        gpu_data = data.as_in_context(mx.gpu())
-        gpu_label = label.as_in_context(mx.gpu())
-
-        # This is to create a model and run the model once to initialize
-        # all parameters.
-        cpu_model = get_model(model_name)
-        cpu_model.collect_params().initialize(ctx=mx.cpu())
-        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-        gpu_model = get_model(model_name)
-        gpu_model.collect_params().initialize(ctx=mx.gpu())
-        gpu_model(mx.nd.array(data, ctx=mx.gpu()))
-
-        # Force the two models have the same parameters.
-        cpu_params = cpu_model.collect_params()
-        gpu_params = gpu_model.collect_params()
-        for k in cpu_params.keys():
-            k = k.replace(cpu_params.prefix, '')
-            cpu_param = cpu_params.get(k)
-            gpu_param = gpu_params.get(k)
-            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
-
-        # Run inference.
-        with autograd.record(train_mode=False):
-            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-            gpu_out = gpu_model(gpu_data)
-        out = cpu_out.asnumpy()
-        max_val = np.max(out)
-        assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-2, atol=1e-2)
-
-def get_nn_model(name):
-    if "densenet" in name:
-        return get_model(name, dropout=0)
-    else:
-        return get_model(name)
-
-def test_training():
-    # We use network models without dropout for testing.
-    # TODO(zhengda) mobilenet can't pass this test even without MKLDNN.
-    all_models = ['resnet18_v1', 'densenet121']
-
-    batch_size = 10
-    label = mx.nd.random.uniform(low=0, high=10, shape=(batch_size)).astype('int32')
-
-    download_data()
-    dataIter = mx.io.ImageRecordIter(
-        path_imgrec        = VAL_DATA,
-        label_width        = 1,
-        preprocess_threads = 1,
-        batch_size         = batch_size,
-        data_shape         = (3, 224, 224),
-        label_name         = 'softmax_label',
-        rand_crop          = False,
-        rand_mirror        = False)
-    data_batch = dataIter.next()
-    data = data_batch.data[0]
-    label = data_batch.label[0]
-    gpu_data = data.as_in_context(mx.gpu())
-    gpu_label = label.as_in_context(mx.gpu())
-    softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss()
-
-    for model_name in all_models:
-        eprint('testing %s'%model_name)
-        #data = mx.nd.random.uniform(shape=(100, 3, 224, 224))
-
-        # This is to create a model and run the model once to initialize
-        # all parameters.
-        cpu_model = get_nn_model(model_name)
-        cpu_model.collect_params().initialize(ctx=mx.cpu())
-        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-        gpu_model = get_nn_model(model_name)
-        gpu_model.collect_params().initialize(ctx=mx.gpu())
-        gpu_model(mx.nd.array(data, ctx=mx.gpu()))
-
-        # Force the two models have the same parameters.
-        cpu_params = cpu_model.collect_params()
-        gpu_params = gpu_model.collect_params()
-        for k in cpu_params.keys():
-            k = k.replace(cpu_params.prefix, '')
-            cpu_param = cpu_params.get(k)
-            gpu_param = gpu_params.get(k)
-            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
-
-        cpu_trainer = mx.gluon.Trainer(cpu_params, 'sgd', {'learning_rate': 0.1})
-        gpu_trainer = mx.gluon.Trainer(gpu_params, 'sgd', {'learning_rate': 0.1})
-
-        # Run forward and backward once.
-        with autograd.record():
-            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-            gpu_out = gpu_model(gpu_data)
-            cpu_loss = softmax_cross_entropy(cpu_out, label)
-            gpu_loss = softmax_cross_entropy(gpu_out, gpu_label)
-        assert_almost_equal(cpu_out.asnumpy(), gpu_out.asnumpy(), rtol=1e-2, atol=1e-2)
-        cpu_loss.backward()
-        gpu_loss.backward()
-        cpu_trainer.step(batch_size)
-        gpu_trainer.step(batch_size)
-
-        # Compare the parameters of the two models.
-        for k in cpu_params.keys():
-            k = k.replace(cpu_params.prefix, '')
-            cpu_param = cpu_params.get(k)
-            gpu_param = gpu_params.get(k)
-            assert_almost_equal(cpu_param.data().asnumpy(), gpu_param.data().asnumpy(), rtol=1e-2, atol=1e-2)
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule()
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 5ae489529c..55bb30cc7d 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -987,13 +987,6 @@ def test_activation_with_type():
     check_consistency(sym, ctx_list)
 
 
-def test_lrn():
-    sym = mx.sym.LRN(alpha=0.0001, beta=0.75, knorm=2, nsize=5, name='lrn')
-    ctx_list = [{'ctx': mx.gpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}},
-                {'ctx': mx.cpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}]
-    check_consistency(sym, ctx_list)
-
-
 def test_embedding_with_type():
     def test_embedding_helper(data_types, weight_types, low_pad, high_pad):
         NVD = [[20, 10, 20], [200, 10, 300]]


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services
