tvm-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tqc...@apache.org
Subject [incubator-tvm] branch master updated: [VTA] HW sources refactor (#5188)
Date Tue, 31 Mar 2020 05:17:48 GMT
This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git


The following commit(s) were added to refs/heads/master by this push:
     new 4683c3f  [VTA] HW sources refactor (#5188)
4683c3f is described below

commit 4683c3f55c51e3e79fa3b099ae7a764130be261e
Author: Thierry Moreau <tmoreau@octoml.ai>
AuthorDate: Mon Mar 30 22:17:36 2020 -0700

    [VTA] HW sources refactor (#5188)
    
    * refactor
    
    * path update
---
 .gitmodules                                        |    3 +
 3rdparty/vta-hw                                    |    1 +
 Makefile                                           |    2 +-
 cmake/modules/VTA.cmake                            |    2 +-
 docs/vta/dev/config.rst                            |    2 +-
 docs/vta/dev/hardware.rst                          |   10 +-
 docs/vta/install.md                                |   30 +-
 tests/scripts/task_build.sh                        |    1 +
 tests/scripts/task_cpp_unittest.sh                 |    2 +
 tests/scripts/task_python_vta_fsim.sh              |    2 +-
 tests/scripts/task_python_vta_tsim.sh              |    2 +-
 vta/python/vta/environment.py                      |    2 +-
 vta/tutorials/autotvm/tune_relay_vta.py            |    2 +-
 vta/tutorials/frontend/deploy_classification.py    |    2 +-
 vta/tutorials/frontend/deploy_detection.py         |    2 +-
 vta/tutorials/matrix_multiply.py                   |    2 +-
 vta/tutorials/optimize/convolution_opt.py          |    2 +-
 vta/tutorials/optimize/matrix_multiply_opt.py      |    2 +-
 vta/vta-hw/apps/gemm/CMakeLists.txt                |   51 -
 vta/vta-hw/apps/gemm/Makefile                      |   42 -
 vta/vta-hw/apps/gemm/README.md                     |   50 -
 vta/vta-hw/apps/gemm/hardware/chisel/Makefile      |  112 --
 vta/vta-hw/apps/gemm/hardware/chisel/build.sbt     |   69 -
 .../gemm/hardware/chisel/project/build.properties  |   20 -
 .../apps/gemm/hardware/chisel/project/plugins.sbt  |   20 -
 .../chisel/src/main/scala/accel/Accel.scala        |   62 -
 .../chisel/src/main/scala/accel/Compute.scala      |  241 ----
 .../chisel/src/main/scala/accel/RegFile.scala      |  123 --
 .../chisel/src/test/scala/dut/TestAccel.scala      |   70 -
 vta/vta-hw/apps/gemm/python/__init__.py            |   18 -
 vta/vta-hw/apps/gemm/python/tsim.py                |   73 -
 vta/vta-hw/apps/gemm/src/driver.cc                 |  177 ---
 vta/vta-hw/apps/gemm/tests/python/chisel_accel.py  |  204 ---
 vta/vta-hw/apps/tsim_example/CMakeLists.txt        |   51 -
 vta/vta-hw/apps/tsim_example/Makefile              |   46 -
 vta/vta-hw/apps/tsim_example/README.md             |   87 --
 .../apps/tsim_example/hardware/chisel/Makefile     |  116 --
 .../apps/tsim_example/hardware/chisel/build.sbt    |   69 -
 .../hardware/chisel/project/build.properties       |   20 -
 .../hardware/chisel/project/plugins.sbt            |   21 -
 .../chisel/src/main/scala/accel/Accel.scala        |   62 -
 .../chisel/src/main/scala/accel/Compute.scala      |  129 --
 .../chisel/src/main/scala/accel/RegFile.scala      |  124 --
 .../chisel/src/test/scala/dut/TestAccel.scala      |   70 -
 .../apps/tsim_example/hardware/verilog/Makefile    |  107 --
 .../apps/tsim_example/hardware/verilog/src/Accel.v |  139 --
 .../tsim_example/hardware/verilog/src/Compute.v    |  180 ---
 .../tsim_example/hardware/verilog/src/RegFile.v    |  184 ---
 .../tsim_example/hardware/verilog/src/TestAccel.v  |  128 --
 vta/vta-hw/apps/tsim_example/python/__init__.py    |   18 -
 vta/vta-hw/apps/tsim_example/python/tsim.py        |   73 -
 vta/vta-hw/apps/tsim_example/src/driver.cc         |  164 ---
 .../apps/tsim_example/tests/python/chisel_accel.py |   40 -
 .../tsim_example/tests/python/verilog_accel.py     |   40 -
 vta/vta-hw/config/README.md                        |   25 -
 vta/vta-hw/config/de10nano_sample.json             |   13 -
 vta/vta-hw/config/fsim_sample.json                 |   13 -
 vta/vta-hw/config/pkg_config.py                    |  310 -----
 vta/vta-hw/config/pynq_sample.json                 |   13 -
 vta/vta-hw/config/tsim_sample.json                 |   13 -
 vta/vta-hw/config/ultra96_sample.json              |   13 -
 vta/vta-hw/config/vta_config.json                  |   13 -
 vta/vta-hw/config/vta_config.py                    |  230 ----
 vta/vta-hw/hardware/chisel/.gitignore              |    1 -
 vta/vta-hw/hardware/chisel/Makefile                |  205 ---
 vta/vta-hw/hardware/chisel/README.md               |   30 -
 vta/vta-hw/hardware/chisel/build.sbt               |   72 -
 .../hardware/chisel/project/build.properties       |   20 -
 vta/vta-hw/hardware/chisel/project/plugins.sbt     |   21 -
 vta/vta-hw/hardware/chisel/scalastyle-config.xml   |  128 --
 .../chisel/src/main/resources/verilog/VTAHostDPI.v |   98 --
 .../chisel/src/main/resources/verilog/VTAMemDPI.v  |  106 --
 .../chisel/src/main/resources/verilog/VTASimDPI.v  |   78 --
 .../chisel/src/main/scala/core/Compute.scala       |  208 ---
 .../chisel/src/main/scala/core/Configs.scala       |   48 -
 .../hardware/chisel/src/main/scala/core/Core.scala |  120 --
 .../chisel/src/main/scala/core/Decode.scala        |  230 ----
 .../chisel/src/main/scala/core/EventCounters.scala |   67 -
 .../chisel/src/main/scala/core/Fetch.scala         |  195 ---
 .../hardware/chisel/src/main/scala/core/ISA.scala  |  148 --
 .../hardware/chisel/src/main/scala/core/Load.scala |  132 --
 .../chisel/src/main/scala/core/LoadUop.scala       |  225 ---
 .../chisel/src/main/scala/core/Semaphore.scala     |   44 -
 .../chisel/src/main/scala/core/Store.scala         |  114 --
 .../chisel/src/main/scala/core/TensorAlu.scala     |  308 -----
 .../chisel/src/main/scala/core/TensorGemm.scala    |  413 ------
 .../chisel/src/main/scala/core/TensorLoad.scala    |  302 ----
 .../chisel/src/main/scala/core/TensorStore.scala   |  256 ----
 .../chisel/src/main/scala/core/TensorUtil.scala    |  355 -----
 .../chisel/src/main/scala/core/package.scala       |   23 -
 .../chisel/src/main/scala/dpi/VTAHostDPI.scala     |  163 ---
 .../chisel/src/main/scala/dpi/VTAMemDPI.scala      |  184 ---
 .../chisel/src/main/scala/dpi/VTASimDPI.scala      |   39 -
 .../chisel/src/main/scala/interface/axi/AXI.scala  |  312 -----
 .../chisel/src/main/scala/shell/Configs.scala      |   80 --
 .../chisel/src/main/scala/shell/IntelShell.scala   |   72 -
 .../chisel/src/main/scala/shell/SimShell.scala     |   99 --
 .../hardware/chisel/src/main/scala/shell/VCR.scala |  206 ---
 .../hardware/chisel/src/main/scala/shell/VME.scala |  260 ----
 .../chisel/src/main/scala/shell/VTAShell.scala     |   57 -
 .../chisel/src/main/scala/shell/XilinxShell.scala  |  119 --
 .../hardware/chisel/src/main/scala/test/Test.scala |   37 -
 .../chisel/src/main/scala/util/Config.scala        |  115 --
 .../scala/util/GenericParameterizedBundle.scala    |   44 -
 .../chisel/src/main/scala/vta/Configs.scala        |   66 -
 .../chisel/src/test/scala/unittest/AluTest.scala   |  104 --
 .../chisel/src/test/scala/unittest/Launcher.scala  |   60 -
 .../chisel/src/test/scala/unittest/MvmTest.scala   |   91 --
 .../src/test/scala/unittest/utils/Helper.scala     |   29 -
 .../test/scala/unittest/utils/RandomArray.scala    |   40 -
 .../src/test/scala/unittest/utils/TestRunner.scala |   91 --
 vta/vta-hw/hardware/dpi/tsim_device.cc             |  171 ---
 vta/vta-hw/hardware/intel/Makefile                 |   95 --
 vta/vta-hw/hardware/intel/README.md                |   18 -
 .../hardware/intel/scripts/compile_design.tcl      |  177 ---
 vta/vta-hw/hardware/intel/scripts/de10_nano_top.v  |  110 --
 .../hardware/intel/scripts/ip/vta/vta_hw.tcl       |  167 ---
 vta/vta-hw/hardware/intel/scripts/set_attrs.py     |   80 --
 vta/vta-hw/hardware/intel/scripts/set_clocks.sdc   |   41 -
 vta/vta-hw/hardware/intel/scripts/soc_system.tcl   |  760 ----------
 vta/vta-hw/hardware/xilinx/.gitignore              |    4 -
 vta/vta-hw/hardware/xilinx/Makefile                |   72 -
 vta/vta-hw/hardware/xilinx/README.md               |   18 -
 vta/vta-hw/hardware/xilinx/scripts/hls.tcl         |  138 --
 vta/vta-hw/hardware/xilinx/scripts/hsi.tcl         |   26 -
 vta/vta-hw/hardware/xilinx/scripts/vivado.tcl      |  437 ------
 vta/vta-hw/hardware/xilinx/sim/vta_test.cc         |   67 -
 vta/vta-hw/hardware/xilinx/src/vta.cc              |  742 ----------
 vta/vta-hw/hardware/xilinx/src/vta.h               |  233 ----
 vta/vta-hw/include/vta/dpi/module.h                |   67 -
 vta/vta-hw/include/vta/dpi/tsim.h                  |  116 --
 vta/vta-hw/include/vta/driver.h                    |  147 --
 vta/vta-hw/include/vta/hw_spec.h                   |  415 ------
 vta/vta-hw/include/vta/sim_tlpp.h                  |  161 ---
 vta/vta-hw/src/de10nano/cma_api.cc                 |   27 -
 vta/vta-hw/src/de10nano/cma_api.h                  |   95 --
 vta/vta-hw/src/de10nano/de10nano_driver.cc         |  179 ---
 vta/vta-hw/src/de10nano/de10nano_driver.h          |   63 -
 vta/vta-hw/src/de10nano/de10nano_mgr.h             |  551 --------
 vta/vta-hw/src/dpi/module.cc                       |  426 ------
 vta/vta-hw/src/pynq/pynq_driver.cc                 |  162 ---
 vta/vta-hw/src/pynq/pynq_driver.h                  |   66 -
 vta/vta-hw/src/sim/sim_driver.cc                   |  550 --------
 vta/vta-hw/src/sim/sim_tlpp.cc                     |  213 ---
 vta/vta-hw/src/tsim/tsim_driver.cc                 |  232 ----
 vta/vta-hw/src/vmem/virtual_memory.cc              |  149 --
 vta/vta-hw/src/vmem/virtual_memory.h               |  129 --
 vta/vta-hw/tests/hardware/common/test_lib.cc       | 1448 --------------------
 vta/vta-hw/tests/hardware/common/test_lib.h        |  340 -----
 vta/vta-hw/tests/hardware/metal_test/Makefile      |   55 -
 vta/vta-hw/tests/hardware/metal_test/metal_test.cc |   70 -
 151 files changed, 39 insertions(+), 19107 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 7814597..a1367c9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "3rdparty/rang"]
 	path = 3rdparty/rang
 	url = https://github.com/agauniyal/rang
+[submodule "3rdparty/vta-hw"]
+	path = 3rdparty/vta-hw
+	url = https://github.com/apache/incubator-tvm-vta
diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
new file mode 160000
index 0000000..db65157
--- /dev/null
+++ b/3rdparty/vta-hw
@@ -0,0 +1 @@
+Subproject commit db65157208ec8fabb7b548c94596211b9db04190
diff --git a/Makefile b/Makefile
index c1b565f..7bfe60b 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ ifndef DLPACK_PATH
 endif
 
 ifndef VTA_HW_PATH
-  VTA_HW_PATH = $(ROOTDIR)/vta/vta-hw
+  VTA_HW_PATH = $(ROOTDIR)/3rdparty/vta-hw
 endif
 
 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index f11ae0c..4af39e0 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -20,7 +20,7 @@ find_program(PYTHON NAMES python python3 python3.6)
 
 # Throw error if VTA_HW_PATH is not set
 if(NOT DEFINED ENV{VTA_HW_PATH})
-  set(VTA_HW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/vta/vta-hw)
+  set(VTA_HW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw)
 else()
   set(VTA_HW_PATH $ENV{VTA_HW_PATH})
 endif()
diff --git a/docs/vta/dev/config.rst b/docs/vta/dev/config.rst
index e9690cb..2f98d77 100644
--- a/docs/vta/dev/config.rst
+++ b/docs/vta/dev/config.rst
@@ -21,7 +21,7 @@ VTA Configuration
 The VTA stack incorporates both a hardware accelerator stack and
 a TVM based software stack.
 VTA incorporates flexibility out of the box: by modifying the
-``vta/vta-hw/config/vta_config.json`` high-level configuration file,
+``3rdparty/vta-hw/config/vta_config.json`` high-level configuration file,
 the user can change the shape of the tensor intrinsic,
 clock frequency, pipelining, data type width, and on-chip buffer sizes.
 
diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst
index 84cfc45..6eb3040 100644
--- a/docs/vta/dev/hardware.rst
+++ b/docs/vta/dev/hardware.rst
@@ -53,17 +53,17 @@ HLS Hardware Source Organization
 
 The VTA design is currently specified in Vivado HLS C++, which is only supported
 by Xilinx toolchains.
-The VTA hardware sources are contained under ``vta/vta-hw/hardware/xilinx/sources``:
+The VTA hardware sources are contained under ``3rdparty/vta-hw/hardware/xilinx/sources``:
 
  - ``vta.cc`` contains the definitions for each VTA module, as well as a top
    level behavioral model for the top-level VTA design.
  - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and
    function prototypes declarations.
 
-In addition preprocessor macros are defined under ``vta/vta-hw/include/vta/hw_spec.h``.
+In addition preprocessor macros are defined under ``3rdparty/vta-hw/include/vta/hw_spec.h``.
 Much of these macro definitions are derived from the parameters listed in the
-``vta/vta-hw/config/vta_config.json`` file.
-The json file is processed by ``vta/vta-hw/config/vta_config.py`` to produce a string of
+``3rdparty/vta-hw/config/vta_config.json`` file.
+The json file is processed by ``3rdparty/vta-hw/config/vta_config.py`` to produce a string of
 compile flags that define the preprocessor macros.
 That string is used by the makefile in order to set those high-level
 parameters in both the HLS hardware synthesis compiler, and the C++
@@ -220,7 +220,7 @@ Microarchitectural Overview
 ---------------------------
 
 We describe the modules that compose the VTA design.
-The module definitions are contained in ``vta/vta-hw/hardware/xilinx/sources/vta.cc``.
+The module definitions are contained in ``3rdparty/vta-hw/hardware/xilinx/sources/vta.cc``.
 
 Fetch Module
 ~~~~~~~~~~~~
diff --git a/docs/vta/install.md b/docs/vta/install.md
index dd7ba9b..a938a67 100644
--- a/docs/vta/install.md
+++ b/docs/vta/install.md
@@ -32,7 +32,7 @@ For a quick and easy start, checkout the [Docker Guide](https://tvm.apache.org/d
 You'll need to set the following paths to use VTA:
 ```bash
 export TVM_PATH=<path to TVM root>
-export VTA_HW_PATH=$TVM_PATH/vta/vta-hw
+export VTA_HW_PATH=$TVM_PATH/3rdparty/vta-hw
 ```
 
 The VTA functional simulation library needs to be enabled when building TVM.
@@ -66,7 +66,7 @@ You are invited to try out our [VTA programming tutorials](https://tvm.apache.or
 ### Advanced Configuration (optional)
 
 VTA is a generic configurable deep learning accelerator.
-The configuration is specified by `vta_config.json` under `vta/vta-hw/config`.
+The configuration is specified by `vta_config.json` under `3rdparty/vta-hw/config`.
 This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack.
 
 The VTA configuration file also specifies the TVM compiler target.
@@ -76,7 +76,7 @@ To do so,
 
 ```bash
 cd <tvm root>
-vim vta/vta-hw/config/vta_config.json
+vim 3rdparty/vta-hw/config/vta_config.json
 # edit vta_config.json
 make
 ```
@@ -134,7 +134,7 @@ mkdir build
 cp cmake/config.cmake build/.
 echo 'set(USE_VTA_FPGA ON)' >> build/config.cmake
 # Copy pynq specific configuration
-cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
+cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json
 cd build
 cmake ..
 make runtime vta -j2
@@ -168,7 +168,7 @@ In addition, you'll need to edit the `vta_config.json` file on the host to indic
 ```bash
 # On the Host-side
 cd <tvm root>
-cp vta/vta-hw/config/pynq_sample.json vta/vta-hw/config/vta_config.json
+cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json
 ```
 
 This time again, we will run the 2D convolution testbench.
@@ -359,11 +359,11 @@ For this custom VTA bitstream compilation exercise, we'll change the frequency o
 * Set the `HW_FREQ` field to `142`. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it will be to close timing. Increasing the frequency can lead to timing violation and thus faulty hardware execution.
 * Set the `HW_CLK_TARGET` to `6`. This parameters refers to the target clock period in nano seconds for HLS - a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock would require a 7ns target, but we intentionally lower the clock target to 6ns to more aggressively pipeline our design.
 
-Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/vta/vta-hw/hardware/xilinx/`.
+Bitstream generation is driven by a top-level `Makefile` under `<tvm root>/3rdparty/vta-hw/hardware/xilinx/`.
 
 If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter:
 ```bash
-cd <tvm root>/vta/vta-hw/hardware/xilinx
+cd <tvm root>/3rdparty/vta-hw/hardware/xilinx
 make ip MODE=sim
 ```
 
@@ -371,7 +371,7 @@ If you just want to generate the HLS-based VTA IP cores without launching the en
 ```bash
 make ip
 ```
-You'll be able to view the HLS synthesis reports under `<tvm root>/vta/vta-hw/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
+You'll be able to view the HLS synthesis reports under `<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/hls/` `<configuration>/<block>/solution0/syn/report/<block>_csynth.rpt`
 > Note: The `<configuration>` name is a string that summarizes the VTA configuration parameters listed in the `vta_config.json`. The `<block>` name refers to the specific module (or HLS function) that compose the high-level VTA pipeline.
 
 Finally to run the full hardware compilation and generate the VTA bitstream, run:
@@ -383,20 +383,20 @@ make
 This process is lengthy, and can take around up to an hour to complete depending on your machine's specs.
 We recommend setting the `VTA_HW_COMP_THREADS` variable in the Makefile to take full advantage of all the cores on your development machine.
 
-Once the compilation completes, the generated bitstream can be found under `<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit`.
 
 ### Chisel-based Custom VTA Bitstream Compilation for DE10-Nano
 
-Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
+Similar to the HLS-based design, high-level hardware parameters in Chisel-based design are listed in the VTA configuration file [Configs.scala](https://github.com/apache/incubator-tvm/blob/master/3rdparty/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala), and they can be customized by the user.
 
-For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvmroot>/vta/vta-hw/hardware/intel`.
+For Intel FPGA, bitstream generation is driven by a top-level `Makefile` under `<tvm root>/3rdparty/vta-hw/hardware/intel`.
 
 If you just want to generate the Chisel-based VTA IP core for the DE10-Nano board without compiling the design for the FPGA hardware, enter:
 ```bash
-cd <tvmroot>/vta/vta-hw/hardware/intel
+cd <tvm root>/3rdparty/vta-hw/hardware/intel
 make ip
 ```
-Then you'll be able to locate the generated verilog file at `<tvmroot>/vta/vta-hw/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
+Then you'll be able to locate the generated verilog file at `<tvm root>/3rdparty/vta-hw/build/hardware/intel/chisel/<configuration>/VTA.DefaultDe10Config.v`.
 
 If you would like to run the full hardware compilation for the `de10nano` board:
 ```bash
@@ -405,14 +405,14 @@ make
 
 This process might be a bit lengthy, and might take up to half an hour to complete depending on the performance of your PC. The Quartus Prime software would automatically detect the number of cores available on your PC and try to utilize all of them to perform such process.
 
-Once the compilation completes, the generated bistream can be found under `<tvmroot>vtay/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvmroot>/vta/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
+Once the compilation completes, the generated bitstream can be found under `<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/export/vta.rbf`. You can also open the Quartus project file (.qpf) available at `<tvm root>/3rdparty/vta-hw/build/hardware/intel/quartus/<configuration>/de10_nano_top.qpf` to look around the generated reports.
 
 ### Use the Custom Bitstream
 
 We can program the new VTA FPGA bitstream by setting the bitstream path of the `vta.program_fpga()` function in the tutorial examples, or in the `test_program_rpc.py` script.
 
 ```python
-vta.program_fpga(remote, bitstream="<tvm root>/vta/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
+vta.program_fpga(remote, bitstream="<tvm root>/3rdparty/vta-hw/build/hardware/xilinx/vivado/<configuration>/export/vta.bit")
 ```
 
 Instead of downloading a pre-built bitstream from the VTA bitstream repository, TVM will instead use the new bitstream you just generated, which is a VTA design clocked at a higher frequency.
diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh
index fbf3a63..d8e35eb 100755
--- a/tests/scripts/task_build.sh
+++ b/tests/scripts/task_build.sh
@@ -15,4 +15,5 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 cd $1 && cmake .. && make $2 && cd ..
diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh
index 69697ca..751e98e 100755
--- a/tests/scripts/task_cpp_unittest.sh
+++ b/tests/scripts/task_cpp_unittest.sh
@@ -20,6 +20,8 @@ set -e
 set -u
 
 export LD_LIBRARY_PATH="lib:${LD_LIBRARY_PATH:-}"
+# NOTE: important to use abspath, when VTA is enabled.
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # Remove existing testcases
 rm -f build/*_test
diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh
index 304d623..f269866 100755
--- a/tests/scripts/task_python_vta_fsim.sh
+++ b/tests/scripts/task_python_vta_fsim.sh
@@ -21,7 +21,7 @@ set -u
 
 export TVM_PATH=`pwd`
 export PYTHONPATH=${TVM_PATH}/python:${TVM_PATH}/vta/python:${TVM_PATH}/topi/python
-export VTA_HW_PATH=`pwd`/vta/vta-hw
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh
index 65057cc..4936674 100755
--- a/tests/scripts/task_python_vta_tsim.sh
+++ b/tests/scripts/task_python_vta_tsim.sh
@@ -21,7 +21,7 @@ set -u
 
 export TVM_PATH=`pwd`
 export PYTHONPATH=${TVM_PATH}/python:${TVM_PATH}/vta/python:${TVM_PATH}/topi/python
-export VTA_HW_PATH=`pwd`/vta/vta-hw
+export VTA_HW_PATH=`pwd`/3rdparty/vta-hw
 
 # cleanup pycache
 find . -type f -path "*.pyc" | xargs rm -f
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index c5b56e3..bbaac2c 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -28,7 +28,7 @@ from . import intrin
 def get_vta_hw_path():
     """Get the VTA HW path."""
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    vta_hw_default = os.path.abspath(os.path.join(curr_path, "../../vta-hw"))
+    vta_hw_default = os.path.abspath(os.path.join(curr_path, "../../../3rdparty/vta-hw"))
     VTA_HW_PATH = os.getenv('VTA_HW_PATH', vta_hw_default)
     return os.path.abspath(VTA_HW_PATH)
 
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index b0870b1..571dde6 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -181,7 +181,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
 tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
 tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py
index 7ce3853..62fb321 100644
--- a/vta/tutorials/frontend/deploy_classification.py
+++ b/vta/tutorials/frontend/deploy_classification.py
@@ -68,7 +68,7 @@ assert tvm.runtime.enabled("rpc")
 # -------------------------------------
 # Execute on CPU vs. VTA, and define the model.
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # Set ``device=arm_cpu`` to run inference on the CPU
diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py
index 83fa8fb..efcd2c4 100644
--- a/vta/tutorials/frontend/deploy_detection.py
+++ b/vta/tutorials/frontend/deploy_detection.py
@@ -111,7 +111,7 @@ names = [x.strip() for x in content]
 # --------------------------------------
 # Execute on CPU vs. VTA, and define the model.
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 # Set ``device=arm_cpu`` to run inference on the CPU
 # or ``device=vta`` to run inference on the FPGA.
diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py
index 227144e..024e179 100644
--- a/vta/tutorials/matrix_multiply.py
+++ b/vta/tutorials/matrix_multiply.py
@@ -43,7 +43,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py
index f609a72..0564a6a 100644
--- a/vta/tutorials/optimize/convolution_opt.py
+++ b/vta/tutorials/optimize/convolution_opt.py
@@ -47,7 +47,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py
index da3b9bb..77b0381 100644
--- a/vta/tutorials/optimize/matrix_multiply_opt.py
+++ b/vta/tutorials/optimize/matrix_multiply_opt.py
@@ -46,7 +46,7 @@ from tvm import rpc
 from tvm.contrib import util
 from vta.testing import simulator
 
-# Load VTA parameters from the vta/vta-hw/config/vta_config.json file
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
 env = vta.get_env()
 
 # We read the Pynq RPC host IP address and port number from the OS environment
diff --git a/vta/vta-hw/apps/gemm/CMakeLists.txt b/vta/vta-hw/apps/gemm/CMakeLists.txt
deleted file mode 100644
index f41a467..0000000
--- a/vta/vta-hw/apps/gemm/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-cmake_minimum_required(VERSION 3.2)
-project(tsim C CXX)
-
-if(NOT DEFINED ENV{TVM_PATH})
-    message(ERROR "Make sure to set TVM_PATH in your environment")
-endif()
-
-if(NOT DEFINED ENV{VTA_HW_PATH})
-    message(ERROR "Make sure to set VTA_HW_PATH in your environment")
-endif()
-
-include_directories("$ENV{TVM_PATH}/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dlpack/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dmlc-core/include")
-include_directories("$ENV{VTA_HW_PATH}/src/dpi")
-
-set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
-set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
-
-if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
-    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
-endif()
-
-file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/dpi/module.cc)
-
-add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE $ENV{VTA_HW_PATH}/include $ENV{VTA_HW_PATH}/src)
-
-if(APPLE)
-  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
-endif(APPLE)
diff --git a/vta/vta-hw/apps/gemm/Makefile b/vta/vta-hw/apps/gemm/Makefile
deleted file mode 100644
index 6bdebea..0000000
--- a/vta/vta-hw/apps/gemm/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-export PYTHONPATH:=$(abspath .)/python:$(PYTHONPATH)
-
-BUILD_NAME = build
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-default: chisel driver serial parallel
-
-serial:
-	python3 tests/python/chisel_accel.py serial
-
-parallel:
-	python3 tests/python/chisel_accel.py parallel
-
-driver: | $(build_dir)
-	cd $(build_dir) && cmake .. && make
-
-$(build_dir):
-	mkdir -p $@
-
-chisel:
-	make -C hardware/chisel
-
-clean:
-	-rm -rf $(build_dir)
-	make -C hardware/chisel clean
diff --git a/vta/vta-hw/apps/gemm/README.md b/vta/vta-hw/apps/gemm/README.md
deleted file mode 100644
index bf7e1c1..0000000
--- a/vta/vta-hw/apps/gemm/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA TSIM Application
-======================
-Before running this application, please take a look at `<vta-hw-root>/apps/tsim_example` for installation instructions.
-This is an application that performs Bit Serial Multiplication for GEMM utilizing TSIM.
-
-**Bit Serial Multiplication for GEMM:**
-
-General Matrix Multiplications (GEMM) are mostly calculated by repeatedly computing the dot product for each pair of vectors.
-The dot product is calculated by summing the element-wise products of the vector pair.
-We approach this operation by slicing and shifting each vector element, much like basic long multiplication works, before accumulating the results.
-We can significantly reduce the cycles required to perform a GEMM when the data bit width is small. This GEMM application uses TSIM for future accelerator prototypes.
-
-* Test Chisel3 backend with bit serial GEMM
-    * Go to `<vta-hw-root>/apps/gemm`
-    * Run `make`
-
-* If you have already compiled the chisel backend (i.e. ran `make`)
-    * To run the bit-serial test with another input set, run `make serial`
-    * To run the bit-parallel test with another input set, run `make parallel`
-
-* Some steps for creating your own custom TSIM application
-    * Go to `<vta-hw-root>/apps/gemm`
-    * Create a custom circuit within `./hardware/chisel/src/main/scala/accel/Compute.scala`
-    * Map the corresponding registers in `./hardware/chisel/src/main/scala/accel/RegFile.scala`
-    * Create your test script
-    * Map the registers in `./src/driver.cc` and link it with both `RegFile.scala` and the test script
-    * Understanding `<vta-hw-root>/apps/tsim_example`, which adds one to each element of a vector, is highly encouraged before creating a more complex application
-
-* Some pointers
-    * Chisel3 tests in `<vta-hw-root>/apps/gemm/tests/python`
-    * Chisel3 accelerator backend `<vta-hw-root>/apps/gemm/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<vta-hw-root>/apps/gemm/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<vta-hw-root>/apps/gemm/python/tsim.py`
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/Makefile b/vta/vta-hw/apps/gemm/hardware/chisel/Makefile
deleted file mode 100644
index 310f623..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/Makefile
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed on a different location
-ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
-    else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
-  else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
-endif
-
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 1
-LIBNAME = libhw
-
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-verilator_build_dir = $(build_dir)/verilator
-chisel_build_dir = $(build_dir)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-cxx_flags = -O2 -Wall -fPIC -shared
-cxx_flags += -fvisibility=hidden -std=c++11
-cxx_flags += -DVL_TSIM_NAME=V$(TOP)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(vta_dir)/include
-cxx_flags += -I$(tvm_dir)/include
-cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
-
-cxx_files = $(VERILATOR_INC_DIR)/verilated.cpp
-cxx_files += $(VERILATOR_INC_DIR)/verilated_dpi.cpp
-cxx_files += $(wildcard $(verilator_build_dir)/*.cpp)
-cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc
-
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_path = $(build_dir)/$(LIBNAME).dylib
-else
-  lib_path = $(build_dir)/$(LIBNAME).so
-endif
-
-default: lib
-
-lib: $(lib_path)
-$(lib_path): $(verilator_build_dir)/V$(TOP).cpp
-	g++ $(cxx_flags) $(cxx_files) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP).cpp
-$(verilator_build_dir)/V$(TOP).cpp: $(chisel_build_dir)/$(TOP).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).v
-$(chisel_build_dir)/$(TOP).v: install_vta_package
-	sbt 'test:runMain test.Elaborate --target-dir $(chisel_build_dir) --top-name $(TOP)'
-
-install_vta_package:
-	cd $(vta_dir)/hardware/chisel && sbt publishLocal
-
-clean:
-	-rm -rf $(build_dir) target project/target project/project
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt b/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt
deleted file mode 100644
index a2afc0d..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "accel"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-libraryDependencies ++= Seq(
-  "edu.berkeley.cs" %% "chisel3" % "3.1.7",
-  "edu.washington.cs" %% "vta" % "0.1.0-SNAPSHOT",
-)
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties b/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998e..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt b/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 79ffb22..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
deleted file mode 100644
index add07c3..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Accel.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import vta.dpi._
-
-/** Bit-serial GEMM accelerator.
-  *
-  * ___________      ___________
-  * |         |      |         |
-  * | HostDPI | <--> | RegFile | <->|
-  * |_________|      |_________|    |
-  *                                 |
-  * ___________      ___________    |
-  * |         |      |         |    |
-  * | MemDPI  | <--> | Compute | <->|
-  * |_________|      |_________|
-  *
-  */
-case class AccelConfig() {
-  val nCtrl = 1
-  val nECnt = 1
-  val nVals = 4
-  val nPtrs = 3
-  val regBits = 32
-  val ptrBits = 2*regBits
-}
-
-class Accel extends Module {
-  val io = IO(new Bundle {
-    val host = new VTAHostDPIClient
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val config = AccelConfig()
-  val rf = Module(new RegFile)
-  val ce = Module(new Compute)
-  rf.io.host <> io.host
-  io.mem <> ce.io.mem
-  ce.io.launch := rf.io.launch
-  rf.io.finish := ce.io.finish
-  rf.io.ecnt <> ce.io.ecnt
-  ce.io.vals <> rf.io.vals
-  ce.io.ptrs <> rf.io.ptrs
-}
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
deleted file mode 100644
index 1eced6e..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/Compute.scala
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-import vta.core._
-import vta.util.config._
-import vta.shell._
-
-class TestConfig extends Config(new CoreConfig ++ new PynqConfig)
-/** Compute
-  *
-  * Bit Slice GEMM:
-  *
-  * 1. Wait for launch to be asserted
-  * 2. Issue 1 read request for 8-bit value at inp1_baddr address (read matrix)
-  * 3. Wait for the value
-  * 4. Increment read-address for next value
-  * 5. Repeat until all inp1 data have been read
-
-  * 6. Issue 1 read request for 8-bit value at inp2_baddr address (read vector)
-  * 7. Wait for the value
-  * 8. Increment read-address for next value
-  * 9. Repeat until all inp2 data have been read
-
-  * 10. Wait for output to be calculated
-  * 11. Issue a write request for 4-byte value at out_baddr address
-  * 12. Increment write-address for next value to write
-  * 13. Check if counter (cntout) is equal to length to assert finish,
-       otherwise go to step 11
-  */
-class Compute(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Output(Bool())
-    val ecnt = Vec(config.nECnt, ValidIO(UInt(config.regBits.W)))
-    val vals = Input(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Input(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val p: Parameters = new TestConfig
-  val sIdle :: sReadAReq :: sReadAData :: sReadADone ::sReadBReq :: sReadBData :: sReadBDone :: sInpDone ::sWait:: sWriteReq :: sWriteData :: sWriteDone :: Nil = Enum(12)
-  val state = RegInit(sIdle)
-  val shift = io.vals(0)
-  val length = io.vals(1)
-  val rstAccum = io.vals(2)
-  val startDot = io.vals(3)
-  val cycles = RegInit(0.U(config.regBits.W))
-  val mvc = Module(new MatrixVectorMultiplication)
-  val reg1 = Reg(chiselTypeOf(mvc.io.wgt.data.bits))
-  val reg2 = Reg(chiselTypeOf(mvc.io.inp.data.bits))
-  val cntwgt = Reg(UInt(config.regBits.W))
-  val cntinp = Reg(UInt(config.regBits.W))
-  val cntout = Reg(UInt(config.regBits.W))
-  val raddr1 = Reg(UInt(config.ptrBits.W))
-  val raddr2 = Reg(UInt(config.ptrBits.W))
-  val waddr = Reg(UInt(config.ptrBits.W))
-  val accum = Module(new Accmulator(size = p(CoreKey).blockOut, accBits = p(CoreKey).accBits))
-
-  switch (state) {
-    is (sIdle) {
-      when (io.launch) {
-        state := sReadAReq
-      }
-    }
-    // Read
-    is (sReadAReq) {
-      state := sReadAData
-    }
-    is (sReadAData) {
-      when (io.mem.rd.valid) {
-        state := sReadADone
-      }
-    }
-    is (sReadADone) {
-      when (cntwgt === (length * length) - 1.U) {
-        state := sReadBReq
-      } .otherwise {
-        state := sReadAReq
-      }
-    }
-    is (sReadBReq) {
-      state := sReadBData
-    }
-    is (sReadBData) {
-      when (io.mem.rd.valid) {
-        state := sReadBDone
-      }
-    }
-    is (sReadBDone) {
-      when (cntinp === length-1.U) {
-        state := sInpDone
-      } .otherwise {
-        state := sReadBReq
-      }
-    }
-    // Both inputs are processed
-    is (sInpDone) {
-      state := sWait
-    }
-    // Wait for computation
-    is (sWait) {
-      when (accum.io.ready) {
-        state := sWriteReq
-      }
-    }
-    // Write
-    is (sWriteReq) {
-      state := sWriteData
-    }
-    is (sWriteData) {
-        state := sWriteDone
-    }
-    is (sWriteDone) {
-      when (cntout === (length - 1.U)) {
-        state := sIdle
-      } .otherwise {
-        state := sWriteReq
-      }
-    }
-  }
-
-  val last = state === sWriteDone && cntout === (length - 1.U)
-
-  // cycle counter
-  when (state === sIdle) {
-    cycles := 0.U
-  } .otherwise {
-    cycles := cycles + 1.U
-  }
-
-  io.ecnt(0).valid := last
-  io.ecnt(0).bits := cycles
-
-  // calculate next address
-  when (state === sIdle) {
-    raddr1 := io.ptrs(0)
-    raddr2 := io.ptrs(1)
-    waddr := io.ptrs(2)
-  } .elsewhen (state === sReadADone) { // increment input array by 1-byte
-    raddr1 := raddr1 + 1.U
-  } .elsewhen (state === sReadBDone) { // increment input array by 1-byte
-    raddr2 := raddr2 + 1.U
-  } .elsewhen (state === sWriteDone) {
-    waddr := waddr + 4.U // writing 4 bytes
-  }
-
-  // create request
-  io.mem.req.valid := state === sReadAReq | state === sReadBReq | state === sWriteReq
-  io.mem.req.opcode := state === sWriteReq
-  io.mem.req.len := 0.U // one-word-per-request
-  io.mem.req.addr := Mux(state === sReadAReq | state === sReadBReq, Mux(state === sReadAReq, raddr1, raddr2), waddr)
-
-  // read
-  when (state === sReadAData && io.mem.rd.valid) {
-    reg1(cntwgt/length)(cntwgt%length) := io.mem.rd.bits(7, 0)
-  }
-
-  when (state === sReadBData && io.mem.rd.valid) {
-    reg2(0)(cntinp) := io.mem.rd.bits(7, 0)
-  }
-
-  io.mem.rd.ready := state === sReadAData | state === sReadBData
-  mvc.io.inp.data.valid := state === sInpDone // 2 inputs have been processed
-  mvc.io.wgt.data.valid := state === sInpDone // 2 inputs have been processed
-
-  mvc.io.wgt.data.bits <> reg1
-  mvc.io.inp.data.bits <> reg2
-  // Modify when shift operation is supported
-  mvc.io.reset := false.B
-  mvc.io.acc_i.data.valid := true.B
-  for (i <- 0 until p(CoreKey).blockOut) {
-    mvc.io.acc_i.data.bits(0)(i) := 0.U
-  }
-
-  accum.io.in := mvc.io.acc_o.data.bits
-  accum.io.shift := shift
-  accum.io.clear := rstAccum
-  accum.io.valid := mvc.io.acc_o.data.valid
-
-  // write
-  io.mem.wr.valid := state === sWriteData
-  io.mem.wr.bits := accum.io.sum(cntout)
-
-  // count read/write
-  when (state === sIdle) {
-    cntwgt := 0.U
-    cntinp := 0.U
-    cntout := 0.U
-  } .elsewhen (state === sReadADone) {
-    cntwgt := cntwgt + 1.U
-  } .elsewhen (state === sReadBDone) {
-    cntinp := cntinp + 1.U
-  } .elsewhen (state === sWriteDone) {
-    cntout := cntout + 1.U
-  }
-
-  io.finish := last // data has been added
-}
-// Shift operation until supported in MVM
-class Accmulator(size: Int = 16, accBits: Int = 32) extends Module {
-  val io = IO(new Bundle {
-    val clear = Input(Bool())
-    val valid = Input(Bool())
-    val ready = Output(Bool())
-    val in = Input(Vec(1, Vec(size, (UInt(accBits.W)))))
-    val shift = Input(UInt(8.W))
-    val sum = Output(Vec(size, (UInt(accBits.W))))
-  })
-    val reg = RegInit(VecInit(Seq.fill(size)(0.U(accBits.W))))
-
-    for (i <- 0 until size) {
-      when (io.clear) {
-        reg(i) := 0.U
-      } .elsewhen(io.valid) {
-        reg(i) := reg(i) + (io.in(0)(i) << io.shift)
-      }
-    }
-    io.ready := RegNext(io.valid)
-    io.sum := reg
-}
-
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
deleted file mode 100644
index 10c40b5..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Register File.
-  *
-  * Twelve 32-bit registers; each 64-bit pointer occupies two consecutive registers.
-  *
-  * -------------------------------
-  *  Register description    | addr
-  * -------------------------|-----
-  *  Control status register | 0x00
-  *  Cycle counter           | 0x04
-  *  Shift value             | 0x08
-  *  Vector length           | 0x0c
-  *  Reset Accumulator       | 0x10
-  *  Input1 pointer          | 0x18
-  *  Input2 pointer          | 0x20
-  *  Output pointer          | 0x28
-  * -------------------------------
-
-  * ------------------------------
-  *  Control status register | bit
-  * ------------------------------
-  *  Launch                  | 0
-  *  Finish                  | 1
-  * ------------------------------
-  */
-class RegFile(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Output(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(config.nECnt, Flipped(ValidIO(UInt(config.regBits.W))))
-    val vals = Output(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Output(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val host = new VTAHostDPIClient
-  })
-  val sIdle :: sRead :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch (state) {
-    is (sIdle) {
-      when (io.host.req.valid && !io.host.req.opcode) {
-        state := sRead
-      }
-    }
-    is (sRead) {
-      state := sIdle
-    }
-  }
-
-  io.host.req.deq := state === sIdle & io.host.req.valid
-
-  val nTotal = config.nCtrl + config.nECnt + config.nVals + (2*config.nPtrs)
-  val reg = Seq.fill(nTotal)(RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value))))
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg)  map { case (a, r) => a.U -> r }
-  val eo = config.nCtrl
-  val vo = eo + config.nECnt
-  val po = vo + config.nVals
-
-  when (io.finish) {
-    reg(0) := "b_10".U
-  } .elsewhen (state === sIdle && io.host.req.valid &&
-        io.host.req.opcode && addr(0).U === io.host.req.addr) {
-    reg(0) := io.host.req.value
-  }
-
-  for (i <- 0 until config.nECnt) {
-    when (io.ecnt(i).valid) {
-      reg(eo + i) := io.ecnt(i).bits
-    } .elsewhen (state === sIdle && io.host.req.valid &&
-          io.host.req.opcode && addr(eo + i).U === io.host.req.addr) {
-      reg(eo + i) := io.host.req.value
-    }
-  }
-
-  for (i <- 0 until (config.nVals + (2*config.nPtrs))) {
-    when (state === sIdle && io.host.req.valid &&
-          io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
-      reg(vo + i) := io.host.req.value
-    }
-  }
-
-  val rdata = RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value)))
-  when (state === sIdle && io.host.req.valid && !io.host.req.opcode) {
-    rdata := MuxLookup(io.host.req.addr, 0.U, reg_map)
-  }
-
-  io.host.resp.valid := state === sRead
-  io.host.resp.bits := rdata
-
-  io.launch := reg(0)(0)
-
-  for (i <- 0 until config.nVals) {
-    io.vals(i) := reg(vo + i)
-  }
-
-  for (i <- 0 until config.nPtrs) {
-    io.ptrs(i) := Cat(reg(po + 2*i + 1), reg(po + 2*i))
-  }
-}
diff --git a/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala b/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
deleted file mode 100644
index d931620..0000000
--- a/vta/vta-hw/apps/gemm/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package test
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.dpi._
-import accel._
-
-/** VTA simulation shell.
-  *
-  * Instantiate Host and Memory DPI modules.
-  *
-  */
-class VTASimShell extends MultiIOModule {
-  val host = IO(new VTAHostDPIMaster)
-  val mem = IO(new VTAMemDPIClient)
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASimDPI)
-  val mod_host = Module(new VTAHostDPI)
-  val mod_mem = Module(new VTAMemDPI)
-  mod_mem.io.clock := clock
-  mod_mem.io.reset := reset
-  mod_mem.io.dpi <> mem
-  mod_host.io.clock := clock
-  mod_host.io.reset := reset
-  host <> mod_host.io.dpi
-  mod_sim.io.clock := sim_clock
-  mod_sim.io.reset := reset
-  sim_wait := mod_sim.io.dpi_wait
-}
-
-/** Test accelerator.
-  *
-  * Instantiate and connect the simulation-shell and the accelerator.
-  *
-  */
-class TestAccel extends MultiIOModule {
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val sim_shell = Module(new VTASimShell)
-  val vta_accel = Module(new Accel)
-  sim_shell.sim_clock := sim_clock
-  sim_wait := sim_shell.sim_wait
-  sim_shell.mem <> vta_accel.io.mem
-  vta_accel.io.host <> sim_shell.host
-}
-
-/** Generate TestAccel as top module */
-object Elaborate extends App {
-  chisel3.Driver.execute(args, () => new TestAccel)
-}
diff --git a/vta/vta-hw/apps/gemm/python/__init__.py b/vta/vta-hw/apps/gemm/python/__init__.py
deleted file mode 100644
index 4bc21e2..0000000
--- a/vta/vta-hw/apps/gemm/python/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from . import tsim
diff --git a/vta/vta-hw/apps/gemm/python/tsim.py b/vta/vta-hw/apps/gemm/python/tsim.py
deleted file mode 100644
index 85fd463..0000000
--- a/vta/vta-hw/apps/gemm/python/tsim.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import ctypes
-import os.path as osp
-from sys import platform
-
-def get_ext():
-    """Return shared library extension"""
-    return ".dylib" if platform == "darwin" else ".so"
-
-def load_dll(dll):
-    """Load shared library
-
-    Parameters
-    ------------
-    dll : str
-        Path for shared library
-
-    Returns
-    ------------
-    The shared library
-    """
-    try:
-        return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
-    except OSError:
-        return []
-
-def load_sw():
-    """Load all software shared libraries"""
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    sw_libname = "libsw" + get_ext()
-    sw_lib = osp.join(cur_path, "..", "build", sw_libname)
-    load_dll(sw_lib)
-
-def init(hw_backend):
-    """Init hardware and software shared library for accelerator
-
-    Parameters
-    ------------
-    hw_backend : str
-        Hardware backend can be verilog or chisel
-
-    """
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    hw_libname = "libhw" + get_ext()
-    if hw_backend in ("verilog", "chisel"):
-        hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
-    load_sw()
-    m = tvm.runtime.load_module(hw_lib, "vta-tsim")
-    f = tvm.get_global_func("tvm.vta.tsim.init")
-    f(m)
-
-def load_module():
-    """Return driver function"""
-    load_sw()
-    return tvm.get_global_func("tvm.vta.driver")
diff --git a/vta/vta-hw/apps/gemm/src/driver.cc b/vta/vta-hw/apps/gemm/src/driver.cc
deleted file mode 100644
index 24b998e..0000000
--- a/vta/vta-hw/apps/gemm/src/driver.cc
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/registry.h>
-#include <vta/dpi/module.h>
-
-#include "vmem/virtual_memory.h"
-
-namespace vta {
-namespace driver {
-
-using vta::dpi::DPIModuleNode;
-using tvm::runtime::Module;
-
-class DPILoader {
- public:
-  ~DPILoader() {
-    dpi_->SimResume();
-    dpi_->SimFinish();
-  }
-
-  void Init(Module module) {
-    mod_ = module;
-    dpi_ = this->Get();
-    dpi_->SimLaunch();
-    dpi_->SimWait();
-  }
-
-  DPIModuleNode* Get() {
-    return static_cast<DPIModuleNode*>(mod_.operator->());
-  }
-
-  static DPILoader* Global() {
-    static DPILoader inst;
-    return &inst;
-  }
-
-  // TVM module
-  Module mod_;
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-};
-
-class Device {
- public:
-  Device() {
-    loader_ = DPILoader::Global();
-  }
-
-  uint32_t Run(DLTensor* inp1, DLTensor* inp2, uint32_t shiftVal, DLTensor* out, uint32_t reset) {
-    uint32_t cycles;
-    uint32_t length = inp2->shape[0];
-    // 1 matrix 1 vector input
-    size_t size1 = (inp1->dtype.bits >> 3) * length * length;
-    size_t size2 = (inp2->dtype.bits >> 3) * length;
-    // 1 vector output
-    size_t size3 = (32 >> 3) * length;
-    inp1_ = this->MemAlloc(size1);
-    inp2_ = this->MemAlloc(size2);
-    out_ = this->MemAlloc(size3);
-    this->MemCopyFromHost(inp1_, inp1->data, size1);
-    this->MemCopyFromHost(inp2_, inp2->data, size2);
-    this->Init();
-    this->Launch(length, shiftVal, reset);
-    cycles = this->WaitForCompletion();
-    this->MemCopyToHost(out->data, out_, size3);
-    this->MemFree(inp1_);
-    this->MemFree(inp2_);
-    this->MemFree(out_);
-    return cycles;
-  }
-
- private:
-  void Init() {
-    dpi_ = loader_->Get();
-    dpi_->SimResume();
-  }
-
-  void* MemAlloc(size_t size) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
-    return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
-  }
-
-  void MemFree(void* buf) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
-    vta::vmem::VirtualMemoryManager::Global()->Free(addr);
-  }
-
-  vta_phy_addr_t MemGetPhyAddr(void* buf) {
-    return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
-  }
-
-  void MemCopyFromHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
-  }
-
-  void MemCopyToHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
-  }
-
-  void Launch(uint32_t length, uint32_t shiftVal, uint32_t reset) {
-    dpi_->WriteReg(0x08, shiftVal);
-    dpi_->WriteReg(0x0c, length); // tensor size
-    dpi_->WriteReg(0x18, this->MemGetPhyAddr(inp1_));
-    dpi_->WriteReg(0x20, this->MemGetPhyAddr(inp2_));
-    dpi_->WriteReg(0x28, this->MemGetPhyAddr(out_));
-    dpi_->WriteReg(0x00, 0x1); // launch
-    dpi_->WriteReg(0x00, 0x0); 
-
-    if (reset == 1) {
-      dpi_->WriteReg(0x10, 0x1); // reset accumulator
-      dpi_->WriteReg(0x10, 0x0); 
-    }
-  }
-
-  uint32_t WaitForCompletion() {
-    uint32_t i, val;
-    for (i = 0; i < wait_cycles_; i++) {
-      val = dpi_->ReadReg(0x00);
-      if (val == 2) break; // finish
-    }
-    val = dpi_->ReadReg(0x04);
-    dpi_->SimWait();
-    return val;
-  }
-
-  // wait cycles
-  uint32_t wait_cycles_{100000000};
-  // DPI loader
-  DPILoader* loader_{nullptr};
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-  // input vm ptr
-  void* inp1_{nullptr};
-  void* inp2_{nullptr};
-  // output vm ptr
-  void* out_{nullptr};
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Module m = args[0];
-    DPILoader::Global()->Init(m);
-  });
-
-TVM_REGISTER_GLOBAL("tvm.vta.driver")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[3];
-    Device dev_;
-    uint32_t cycles = dev_.Run(A, B, static_cast<int>(args[2]), C, static_cast<int>(args[4]));
-    *rv = static_cast<int>(cycles);
-  });
-
-}  // namespace driver
-}  // namespace vta
diff --git a/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py b/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py
deleted file mode 100644
index 441f36d..0000000
--- a/vta/vta-hw/apps/gemm/tests/python/chisel_accel.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-import sys
-
-""" Vector Bit Slice and Pack Function
-Parameters
-----------
-A : Vector to be sliced and packed
-slice_width : slice width
-
-Returns
----------
-C: 2d matrix where each column (because of bit packing) represents each bit slice of A
-"""
-def slice(A, slice_width):
-    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
-    dtype = type(A[0])
-    row = 0
-    # currently only supports uint
-    if dtype is np.uint8: row = 8 // slice_width
-    elif dtype is np.uint16: row = 16 // slice_width
-    elif dtype is np.uint32: row = 32 // slice_width
-    elif dtype is np.uint64: row = 64 // slice_width
-    else: raise ValueError("datatype currently not supported")
-    if (row >= 8):
-        dtype = 'uint' + str(row)
-    else:
-        dtype = 'uint8'
-
-    C = np.zeros((row, len(A))).astype(dtype) # sliced and transform
-
-    # create mask
-    slice_mask = 2**(slice_width)-1
-    # slice and pack
-    for x in range(len(A)):
-        for y in range(row):
-            C[y][x] = (np.uint64(A[x]) >> np.uint64(slice_width * y)) & np.uint64(slice_mask)
-    return C
-
-def slice_mat(A, slice_width):
-    assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported"
-    dtype = type(A[0][0])
-    row = 0
-    # currently only supports uint
-    if dtype is np.uint8: row = 8 // slice_width
-    elif dtype is np.uint16: row = 16 // slice_width
-    elif dtype is np.uint32: row = 32 // slice_width
-    elif dtype is np.uint64: row = 64 // slice_width
-    else: raise ValueError("datatype currently not supported")
-    if (row >= 8):
-        dtype = 'uint' + str(row)
-    else:
-        dtype = 'uint8'
-
-    # 3d array (bits, row, clmn)
-    C = np.zeros((row, A.shape[0], A.shape[1])).astype(dtype) # sliced and transform
-
-    # create mask
-    slice_mask = 2**(slice_width)-1
-    # slice and pack
-    for z in range(A.shape[0]):
-        C[:, z, :] = slice(A[z], slice_width)
-    return C
-
-""" Matrix Multiplication Function
-Parameters
-----------
-A : Matrix A
-B: Matrix B
-i_width : activation (input) slice width
-w_width : weight slice width
-
-Returns
----------
-C: result of A * B
-"""
-# A is an n*m matrix, B is an m*p matrix (not transposed yet)
-def matrix_multiply(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[0], "can't perform multiplication"
-    BT = B.transpose()
-    cycles = 0
-    B_sliced = slice_mat(BT, w_width)
-    C = np.zeros((A.shape[0], B.shape[1])).astype('uint64')
-    for i in range(A.shape[0]):
-        A_sliced = slice(A[i], i_width)
-        test = test_accel(A_sliced, B_sliced, i_width, w_width)
-        C[i] = test[0]
-        cycles += test[1]
-        np.testing.assert_array_equal(C[i], compute(A_sliced, B_sliced, i_width, w_width))
-        print("PASS row " + str(i))
-
-    np.testing.assert_array_equal(C, np.matmul(A.astype('uint64'),B))
-    print("result: ")
-    print(C)
-    print("TEST PASSED, cycles: " + str(cycles))
-    return C
-
-""" Software Verification Function
-Parameter Dimensions
----------
-A (bits, y) and B (bits, y, x) (transposed)
-
-Takes 1 vector and 1 matrix input (sliced and packed)
-
-Returns
----------
-Resulting vector
-"""
-def compute(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[1], "sliced shape not match"
-    # reset hardware accumulator
-    accum = np.zeros(A.shape[1])
-    for x in range(A.shape[0]):
-        for y in range(B.shape[0]):
-            accum += np.matmul(A[x].astype('uint64'), B[y].transpose()) << np.uint64(x*i_width + y*w_width)
-    # get value from accumulator
-    return accum
-
-"""Testing Function for Matrix Vector Multiplication"""
-def test_accel(A, B, i_width, w_width):
-    assert A.shape[1] == B.shape[2], "sliced shape not match"
-    dtype = A.dtype
-    ctx = tvm.cpu(0)
-    f = tsim.load_module()
-
-    a_arr = []
-    b_arr = []
-    for i in range(A.shape[0]):
-        list_a = np.zeros(A.shape[1]).astype(dtype)
-        for j in range(A.shape[1]):
-            list_a[j] = A[i][j]
-        a_arr.append(tvm.nd.array(list_a.astype(dtype), ctx))
-
-    for i in range(B.shape[0]):
-        # transpose
-        list_b = np.zeros((B.shape[2], B.shape[1])).astype(dtype)
-        for j in range(B.shape[2]):
-            for k in range(B.shape[1]):
-                list_b[j][k] = B[i][j][k]
-        b_arr.append(tvm.nd.array(list_b.astype(dtype), ctx))
-
-    cycles = 0
-    accum = tvm.nd.array(np.zeros(A.shape[1]).astype("uint32"), ctx)
-    for i in range(len(a_arr)):
-        for j in range(len(b_arr)):
-            shift = np.uint8(i*i_width + j*w_width)
-            if i == 0 and j == 0:
-                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(1)) # reset accumulator
-            else:
-                cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(0)) # no reset
-
-    return (accum.asnumpy(), cycles)
-
-""" Matrix Generator
-Parameters
-----------
-dtype : String, datatype generated (supports only uint)
-i_width : activation (input) slice width in bits (must not exceed the actual bit width)
-w_width : weight slice width in bits (must not exceed the actual bit width)
-"""
-def top_test(dtype, i_width, w_width):
-
-    # only supports positive values (up to 2**(bits-1))
-    rmax = 127
-    # (m,16) * (16,16) GEMM
-    rrow = np.random.randint(7) + 1
-    clmn = 16
-    A = np.random.randint(rmax, size=(rrow,clmn)).astype(dtype)
-    B = np.random.randint(rmax, size=(clmn,clmn)).astype(dtype)
-
-    print("A: " + str(A))
-    print("B: " + str(B))
-    # perform GEMM
-    matrix_multiply(A, B, i_width, w_width)
-
-if __name__ == "__main__":
-    tsim.init("chisel")
-    for i in range(1):
-        # reg1 and reg2 bits in hardware/chisel/src/main/Compute.scala must be modified for slices greater than 8 bits
-        if sys.argv[1] == 'serial':
-          # generates a random uint8 GEMM with the input split into 2 slices (8/4) of 4 bits and the weight into 4 slices (8/2) of 2 bits
-          top_test("uint8", 4, 2)
-        elif sys.argv[1] == 'parallel':
-          # generates a random uint8 GEMM with 8-bit input and 8-bit weight (bit parallel)
-          top_test('uint8', 8, 8)
diff --git a/vta/vta-hw/apps/tsim_example/CMakeLists.txt b/vta/vta-hw/apps/tsim_example/CMakeLists.txt
deleted file mode 100644
index f41a467..0000000
--- a/vta/vta-hw/apps/tsim_example/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-cmake_minimum_required(VERSION 3.2)
-project(tsim C CXX)
-
-if(NOT DEFINED ENV{TVM_PATH})
-    message(ERROR "Make sure to set TVM_PATH in your environment")
-endif()
-
-if(NOT DEFINED ENV{VTA_HW_PATH})
-    message(ERROR "Make sure to set VTA_HW_PATH in your environment")
-endif()
-
-include_directories("$ENV{TVM_PATH}/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dlpack/include")
-include_directories("$ENV{TVM_PATH}/3rdparty/dmlc-core/include")
-include_directories("$ENV{VTA_HW_PATH}/src/dpi")
-
-set(CMAKE_C_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden")
-set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -fvisibility=hidden -std=c++11")
-
-if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
-    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
-  set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
-endif()
-
-file(GLOB TSIM_SW_SRC src/driver.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/vmem/virtual_memory.cc)
-list(APPEND TSIM_SW_SRC $ENV{VTA_HW_PATH}/src/dpi/module.cc)
-
-add_library(sw SHARED ${TSIM_SW_SRC})
-target_include_directories(sw PRIVATE $ENV{VTA_HW_PATH}/include $ENV{VTA_HW_PATH}/src)
-
-if(APPLE)
-  set_target_properties(sw PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
-endif(APPLE)
diff --git a/vta/vta-hw/apps/tsim_example/Makefile b/vta/vta-hw/apps/tsim_example/Makefile
deleted file mode 100644
index 406f931..0000000
--- a/vta/vta-hw/apps/tsim_example/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-export PYTHONPATH:=$(PWD)/python:$(PYTHONPATH)
-
-BUILD_NAME = build
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-default: run_verilog
-
-run_verilog: verilog driver
-	python3 tests/python/verilog_accel.py
-
-run_chisel: chisel driver
-	python3 tests/python/chisel_accel.py
-
-driver: | $(build_dir)
-	cd $(build_dir) && cmake .. && make
-
-$(build_dir):
-	mkdir -p $@
-
-verilog:
-	make -C hardware/verilog
-
-chisel:
-	make -C hardware/chisel
-
-clean:
-	-rm -rf $(build_dir)
-	make -C hardware/chisel clean
-	make -C hardware/verilog clean
diff --git a/vta/vta-hw/apps/tsim_example/README.md b/vta/vta-hw/apps/tsim_example/README.md
deleted file mode 100644
index 07d9841..0000000
--- a/vta/vta-hw/apps/tsim_example/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA TSIM Installation
-======================
-
-*TSIM* is a cycle-accurate hardware simulation environment that can be invoked and managed directly from TVM. It aims to enable cycle-accurate simulation of deep learning accelerators, including VTA.
-This simulation environment can be used on both OSX and Linux.
-Two dependencies are required to make *TSIM* work: [Verilator](https://www.veripool.org/wiki/verilator) and [sbt](https://www.scala-sbt.org/) for accelerators designed in [Chisel3](https://github.com/freechipsproject/chisel3).
-
-## OSX Dependencies
-
-Install `sbt` and `verilator` using [Homebrew](https://brew.sh/).
-
-```bash
-brew install verilator sbt
-```
-
-## Linux Dependencies
-
-Add `sbt` to package manager (Ubuntu).
-
-```bash
-echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
-sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
-sudo apt-get update
-```
-
-Install `sbt` and `verilator`.
-
-```bash
-sudo apt install verilator sbt
-```
-
-Verilator version check
-
-```bash
-verilator --version
-```
-
-The supported version of Verilator is 4.012 or later.
-If Homebrew (OSX) or the package manager (Linux) does not provide that version,
-please install Verilator 4.012 or later from a binary or from source, following
-the instructions on the Verilator wiki.
-
-https://www.veripool.org/projects/verilator/wiki/Installing
-
-## Setup in TVM
-
-1. Install `verilator` and `sbt` as described above
-2. Get tvm `git clone https://github.com/apache/incubator-tvm.git tvm --recursive`
-3. Build [tvm](https://docs.tvm.ai/install/from_source.html#build-the-shared-library)
-
-## How to run VTA TSIM examples
-
-There are two sample VTA accelerators, add-a-constant, designed in Chisel3 and Verilog to show how *TSIM* works.
-The default target language for these two implementations is Verilog. The following instructions show
-how to run both of them:
-
-* Test Verilog backend
-    * Go to `<vta-hw-root>/apps/tsim_example`
-    * Run `make`
-
-* Test Chisel3 backend
-    * Go to `<vta-hw-root>/apps/tsim_example`
-    * Run `make run_chisel`
-
-* Some pointers
-    * Verilog and Chisel3 tests in `<vta-hw-root>/apps/tsim_example/tests/python`
-    * Verilog accelerator backend `<vta-hw-root>/apps/tsim_example/hardware/verilog`
-    * Chisel3 accelerator backend `<vta-hw-root>/apps/tsim_example/hardware/chisel`
-    * Software C++ driver (backend) that handles the accelerator `<vta-hw-root>/apps/tsim_example/src/driver.cc`
-    * Software Python driver (frontend) that handles the accelerator `<vta-hw-root>/apps/tsim_example/python/accel`
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile b/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile
deleted file mode 100644
index 2bbe777..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/Makefile
+++ /dev/null
@@ -1,116 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed in a different location
-ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
-    else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
-  else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
-endif
-
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 0
-LIBNAME = libhw
-
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-verilator_build_dir = $(build_dir)/verilator
-chisel_build_dir = $(build_dir)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-cxx_flags = -O2 -Wall -fPIC -shared
-cxx_flags += -fvisibility=hidden -std=c++11
-cxx_flags += -DVL_TSIM_NAME=V$(TOP)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(vta_dir)/include
-cxx_flags += -I$(tvm_dir)/include
-cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
-
-cxx_files = $(VERILATOR_INC_DIR)/verilated.cpp
-cxx_files += $(VERILATOR_INC_DIR)/verilated_dpi.cpp
-cxx_files += $(wildcard $(verilator_build_dir)/*.cpp)
-cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc
-
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_path = $(build_dir)/$(LIBNAME).dylib
-else
-  lib_path = $(build_dir)/$(LIBNAME).so
-endif
-
-default: lint lib
-
-lint:
-	cp $(vta_dir)/hardware/chisel/scalastyle-config.xml .
-	sbt scalastyle
-
-lib: $(lib_path)
-$(lib_path): $(verilator_build_dir)/V$(TOP).cpp
-	g++ $(cxx_flags) $(cxx_files) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP).cpp
-$(verilator_build_dir)/V$(TOP).cpp: $(chisel_build_dir)/$(TOP).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).v
-$(chisel_build_dir)/$(TOP).v: install_vta_package
-	sbt 'test:runMain test.Elaborate --target-dir $(chisel_build_dir) --top-name $(TOP)'
-
-install_vta_package:
-	cd $(vta_dir)/hardware/chisel && sbt publishLocal
-
-clean:
-	-rm -rf $(build_dir) target project/target project/project
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt b/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt
deleted file mode 100644
index a2afc0d..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "accel"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-libraryDependencies ++= Seq(
-  "edu.berkeley.cs" %% "chisel3" % "3.1.7",
-  "edu.washington.cs" %% "vta" % "0.1.0-SNAPSHOT",
-)
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties b/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998e..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt b/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 19ae5c9..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
deleted file mode 100644
index 7ba1e63..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import vta.dpi._
-
-/** Add-a-constant accelerator.
- *
- * ___________      ___________
- * |         |      |         |
- * | HostDPI | <--> | RegFile | <->|
- * |_________|      |_________|    |
- *                                 |
- * ___________      ___________    |
- * |         |      |         |    |
- * | MemDPI  | <--> | Compute | <->|
- * |_________|      |_________|
- *
- */
-case class AccelConfig() {
-  val nCtrl = 1
-  val nECnt = 1
-  val nVals = 2
-  val nPtrs = 2
-  val regBits = 32
-  val ptrBits = 2 * regBits
-}
-
-class Accel extends Module {
-  val io = IO(new Bundle {
-    val host = new VTAHostDPIClient
-    val mem = new VTAMemDPIMaster
-  })
-  implicit val config = AccelConfig()
-  val rf = Module(new RegFile)
-  val ce = Module(new Compute)
-  rf.io.host <> io.host
-  io.mem <> ce.io.mem
-  ce.io.launch := rf.io.launch
-  rf.io.finish := ce.io.finish
-  rf.io.ecnt <> ce.io.ecnt
-  ce.io.vals <> rf.io.vals
-  ce.io.ptrs <> rf.io.ptrs
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
deleted file mode 100644
index 3ef2e7e..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Compute
- *
- * Add-by-one procedure:
- *
- * 1. Wait for launch to be asserted
- * 2. Issue a read request for 8-byte value at inp_baddr address
- * 3. Wait for the value
- * 4. Issue a write request for 8-byte value at out_baddr address
- * 5. Increment read-address and write-address for next value
- * 6. Check if counter (cnt) is equal to length to assert finish,
- *    otherwise go to step 2.
- */
-class Compute(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Output(Bool())
-    val ecnt = Vec(config.nECnt, ValidIO(UInt(config.regBits.W)))
-    val vals = Input(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Input(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val mem = new VTAMemDPIMaster
-  })
-  val sIdle :: sReadReq :: sReadData :: sWriteReq :: sWriteData :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-  val const = io.vals(0)
-  val length = io.vals(1)
-  val cycles = RegInit(0.U(config.regBits.W))
-  val reg = Reg(chiselTypeOf(io.mem.rd.bits))
-  val cnt = Reg(UInt(config.regBits.W))
-  val raddr = Reg(UInt(config.ptrBits.W))
-  val waddr = Reg(UInt(config.ptrBits.W))
-
-  switch(state) {
-    is(sIdle) {
-      when(io.launch) {
-        state := sReadReq
-      }
-    }
-    is(sReadReq) {
-      state := sReadData
-    }
-    is(sReadData) {
-      when(io.mem.rd.valid) {
-        state := sWriteReq
-      }
-    }
-    is(sWriteReq) {
-      state := sWriteData
-    }
-    is(sWriteData) {
-      when(cnt === (length - 1.U)) {
-        state := sIdle
-      }.otherwise {
-        state := sReadReq
-      }
-    }
-  }
-
-  val last = state === sWriteData && cnt === (length - 1.U)
-
-  // cycle counter
-  when(state === sIdle) {
-    cycles := 0.U
-  }.otherwise {
-    cycles := cycles + 1.U
-  }
-
-  io.ecnt(0).valid := last
-  io.ecnt(0).bits := cycles
-
-  // calculate next address
-  when(state === sIdle) {
-    raddr := io.ptrs(0)
-    waddr := io.ptrs(1)
-  }.elsewhen(state === sWriteData) { // increment by 8-bytes
-    raddr := raddr + 8.U
-    waddr := waddr + 8.U
-  }
-
-  // create request
-  io.mem.req.valid := state === sReadReq | state === sWriteReq
-  io.mem.req.opcode := state === sWriteReq
-  io.mem.req.len := 0.U // one-word-per-request
-  io.mem.req.addr := Mux(state === sReadReq, raddr, waddr)
-
-  // read
-  when(state === sReadData && io.mem.rd.valid) {
-    reg := io.mem.rd.bits + const
-  }
-  io.mem.rd.ready := state === sReadData
-
-  // write
-  io.mem.wr.valid := state === sWriteData
-  io.mem.wr.bits := reg
-
-  // count read/write
-  when(state === sIdle) {
-    cnt := 0.U
-  }.elsewhen(state === sWriteData) {
-    cnt := cnt + 1.U
-  }
-
-  // done when read/write are equal to length
-  io.finish := last
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
deleted file mode 100644
index 2764510..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package accel
-
-import chisel3._
-import chisel3.util._
-import vta.dpi._
-
-/** Register File.
- *
- * Six 32-bit register file.
- *
- * -------------------------------
- *  Register description    | addr
- * -------------------------|-----
- *  Control status register | 0x00
- *  Cycle counter           | 0x04
- *  Constant value          | 0x08
- *  Vector length           | 0x0c
- *  Input pointer lsb       | 0x10
- *  Input pointer msb       | 0x14
- *  Output pointer lsb      | 0x18
- *  Output pointer msb      | 0x1c
- * -------------------------------
- *
- * ------------------------------
- *  Control status register | bit
- * ------------------------------
- *  Launch                  | 0
- *  Finish                  | 1
- * ------------------------------
- */
-class RegFile(implicit config: AccelConfig) extends Module {
-  val io = IO(new Bundle {
-    val launch = Output(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(config.nECnt, Flipped(ValidIO(UInt(config.regBits.W))))
-    val vals = Output(Vec(config.nVals, UInt(config.regBits.W)))
-    val ptrs = Output(Vec(config.nPtrs, UInt(config.ptrBits.W)))
-    val host = new VTAHostDPIClient
-  })
-  val sIdle :: sRead :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.host.req.valid && !io.host.req.opcode) {
-        state := sRead
-      }
-    }
-    is(sRead) {
-      state := sIdle
-    }
-  }
-
-  io.host.req.deq := state === sIdle & io.host.req.valid
-
-  val nTotal = config.nCtrl + config.nECnt + config.nVals + (2 * config.nPtrs)
-  val reg =
-    Seq.fill(nTotal)(RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value))))
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg) map { case (a, r) => a.U -> r }
-  val eo = config.nCtrl
-  val vo = eo + config.nECnt
-  val po = vo + config.nVals
-
-  when(io.finish) {
-    reg(0) := "b_10".U
-  }.elsewhen(state === sIdle && io.host.req.valid &&
-    io.host.req.opcode && addr(0).U === io.host.req.addr) {
-    reg(0) := io.host.req.value
-  }
-
-  for (i <- 0 until config.nECnt) {
-    when(io.ecnt(i).valid) {
-      reg(eo + i) := io.ecnt(i).bits
-    }.elsewhen(state === sIdle && io.host.req.valid &&
-      io.host.req.opcode && addr(eo + i).U === io.host.req.addr) {
-      reg(eo + i) := io.host.req.value
-    }
-  }
-
-  for (i <- 0 until (config.nVals + (2 * config.nPtrs))) {
-    when(state === sIdle && io.host.req.valid &&
-      io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
-      reg(vo + i) := io.host.req.value
-    }
-  }
-
-  val rdata = RegInit(0.U.asTypeOf(chiselTypeOf(io.host.req.value)))
-  when(state === sIdle && io.host.req.valid && !io.host.req.opcode) {
-    rdata := MuxLookup(io.host.req.addr, 0.U, reg_map)
-  }
-
-  io.host.resp.valid := state === sRead
-  io.host.resp.bits := rdata
-
-  io.launch := reg(0)(0)
-
-  for (i <- 0 until config.nVals) {
-    io.vals(i) := reg(vo + i)
-  }
-
-  for (i <- 0 until config.nPtrs) {
-    io.ptrs(i) := Cat(reg(po + (2 * i) + 1), reg(po + (2 * i)))
-  }
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala b/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
deleted file mode 100644
index d931620..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/chisel/src/test/scala/dut/TestAccel.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package test
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.dpi._
-import accel._
-
-/** VTA simulation shell.
-  *
-  * Instantiate Host and Memory DPI modules.
-  *
-  */
-class VTASimShell extends MultiIOModule {
-  val host = IO(new VTAHostDPIMaster)
-  val mem = IO(new VTAMemDPIClient)
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASimDPI)
-  val mod_host = Module(new VTAHostDPI)
-  val mod_mem = Module(new VTAMemDPI)
-  mod_mem.io.clock := clock
-  mod_mem.io.reset := reset
-  mod_mem.io.dpi <> mem
-  mod_host.io.clock := clock
-  mod_host.io.reset := reset
-  host <> mod_host.io.dpi
-  mod_sim.io.clock := sim_clock
-  mod_sim.io.reset := reset
-  sim_wait := mod_sim.io.dpi_wait
-}
-
-/** Test accelerator.
-  *
-  * Instantiate and connect the simulation-shell and the accelerator.
-  *
-  */
-class TestAccel extends MultiIOModule {
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val sim_shell = Module(new VTASimShell)
-  val vta_accel = Module(new Accel)
-  sim_shell.sim_clock := sim_clock
-  sim_wait := sim_shell.sim_wait
-  sim_shell.mem <> vta_accel.io.mem
-  vta_accel.io.host <> sim_shell.host
-}
-
-/** Generate TestAccel as top module */
-object Elaborate extends App {
-  chisel3.Driver.execute(args, () => new TestAccel)
-}
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile b/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile
deleted file mode 100644
index 72b0a2a..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/Makefile
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Change VERILATOR_INC_DIR if Verilator is installed on a different location
-ifeq (, $(VERILATOR_INC_DIR))
-  ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-    ifeq (, $(wildcard /usr/share/verilator/include/*))
-      $(error "Verilator include directory is not set properly")
-    else
-      VERILATOR_INC_DIR := /usr/share/verilator/include
-    endif
-  else
-      VERILATOR_INC_DIR := /usr/local/share/verilator/include
-  endif
-endif
-
-TOP = TestAccel
-BUILD_NAME = build
-USE_TRACE = 0
-LIBNAME = libhw
-
-vta_dir = $(abspath ../../../../)
-tvm_dir = $(abspath ../../../../../../)
-build_dir = $(abspath .)/$(BUILD_NAME)
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP}
-verilator_opt += -Mdir ${build_dir}
-
-cxx_flags = -O2 -Wall -fPIC -shared
-cxx_flags += -fvisibility=hidden -std=c++11
-cxx_flags += -DVL_TSIM_NAME=V$(TOP)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP).h
-cxx_flags += -I$(build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(vta_dir)/include
-cxx_flags += -I$(tvm_dir)/include
-cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
-
-cxx_files = $(VERILATOR_INC_DIR)/verilated.cpp
-cxx_files += $(VERILATOR_INC_DIR)/verilated_dpi.cpp
-cxx_files += $(wildcard $(build_dir)/*.cpp)
-cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc
-
-v_files = $(wildcard $(abspath .)/src/*.v $(vta_dir)/hardware/chisel/src/main/resources/verilog/*.v)
-
-ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
-  cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(build_dir)/$(TOP).vcd
-  cxx_files += $(VERILATOR_INC_DIR)/verilated_vcd_c.cpp
-else
-  cxx_flags += -DVM_TRACE=0
-endif
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-  lib_path = $(build_dir)/$(LIBNAME).dylib
-else
-  lib_path = $(build_dir)/$(LIBNAME).so
-endif
-
-default: lib
-
-lib: $(lib_path)
-$(lib_path): $(build_dir)/V$(TOP).cpp
-	g++ $(cxx_flags) $(cxx_files) -o $@
-
-verilator: $(build_dir)/V$(TOP).cpp
-$(build_dir)/V$(TOP).cpp: $(v_files) | $(build_dir)
-	verilator $(verilator_opt) $(v_files)
-
-$(build_dir):
-	mkdir -p $@
-
-clean:
-	-rm -rf $(build_dir)
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v
deleted file mode 100644
index 34d7d95..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Accel.v
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Add-by-one accelerator.
-  *
-  * ___________      ___________
-  * |         |      |         |
-  * | HostDPI | <--> | RegFile | <->|
-  * |_________|      |_________|    |
-  *                                 |
-  * ___________      ___________    |
-  * |         |      |         |    |
-  * | MemDPI  | <--> | Compute | <->|
-  * |_________|      |_________|
-  *
-  */
-module Accel #
-( parameter HOST_ADDR_BITS = 8,
-  parameter HOST_DATA_BITS = 32,
-  parameter MEM_LEN_BITS = 8,
-  parameter MEM_ADDR_BITS = 64,
-  parameter MEM_DATA_BITS = 64
-)
-(
-  input                         clock,
-  input                         reset,
-
-  input                         host_req_valid,
-  input                         host_req_opcode,
-  input    [HOST_ADDR_BITS-1:0] host_req_addr,
-  input    [HOST_DATA_BITS-1:0] host_req_value,
-  output                        host_req_deq,
-  output                        host_resp_valid,
-  output   [HOST_DATA_BITS-1:0] host_resp_bits,
-
-  output                        mem_req_valid,
-  output                        mem_req_opcode,
-  output     [MEM_LEN_BITS-1:0] mem_req_len,
-  output    [MEM_ADDR_BITS-1:0] mem_req_addr,
-  output                        mem_wr_valid,
-  output    [MEM_DATA_BITS-1:0] mem_wr_bits,
-  input                         mem_rd_valid,
-  input     [MEM_DATA_BITS-1:0] mem_rd_bits,
-  output                        mem_rd_ready
-);
-
-  logic                      launch;
-  logic                      finish;
-
-  logic                      event_counter_valid;
-  logic [HOST_DATA_BITS-1:0] event_counter_value;
-
-  logic [HOST_DATA_BITS-1:0] constant;
-  logic [HOST_DATA_BITS-1:0] length;
-  logic  [MEM_ADDR_BITS-1:0] inp_baddr;
-  logic  [MEM_ADDR_BITS-1:0] out_baddr;
-
-  RegFile #
-  (
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .HOST_ADDR_BITS(HOST_ADDR_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS)
-  )
-  rf
-  (
-    .clock               (clock),
-    .reset               (reset),
-
-    .host_req_valid      (host_req_valid),
-    .host_req_opcode     (host_req_opcode),
-    .host_req_addr       (host_req_addr),
-    .host_req_value      (host_req_value),
-    .host_req_deq        (host_req_deq),
-    .host_resp_valid     (host_resp_valid),
-    .host_resp_bits      (host_resp_bits),
-
-    .launch              (launch),
-    .finish              (finish),
-
-    .event_counter_valid (event_counter_valid),
-    .event_counter_value (event_counter_value),
-
-    .constant            (constant),
-    .length              (length),
-    .inp_baddr           (inp_baddr),
-    .out_baddr           (out_baddr)
-  );
-
-  Compute #
-  (
-    .MEM_LEN_BITS(MEM_LEN_BITS),
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .MEM_DATA_BITS(MEM_DATA_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS)
-  )
-  comp
-  (
-    .clock               (clock),
-    .reset               (reset),
-
-    .mem_req_valid       (mem_req_valid),
-    .mem_req_opcode      (mem_req_opcode),
-    .mem_req_len         (mem_req_len),
-    .mem_req_addr        (mem_req_addr),
-    .mem_wr_valid        (mem_wr_valid),
-    .mem_wr_bits         (mem_wr_bits),
-    .mem_rd_valid        (mem_rd_valid),
-    .mem_rd_bits         (mem_rd_bits),
-    .mem_rd_ready        (mem_rd_ready),
-
-    .launch              (launch),
-    .finish              (finish),
-
-    .event_counter_valid (event_counter_valid),
-    .event_counter_value (event_counter_value),
-
-    .constant            (constant),
-    .length              (length),
-    .inp_baddr           (inp_baddr),
-    .out_baddr           (out_baddr)
-  );
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v
deleted file mode 100644
index 4360b1c..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/Compute.v
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Compute
-  *
-  * Add-by-one procedure:
-  *
-  * 1. Wait for launch to be asserted
-  * 2. Issue a read request for 8-byte value at inp_baddr address
-  * 3. Wait for the value
-  * 4. Issue a write request for 8-byte value at out_baddr address
-  * 5. Increment read-address and write-address for next value
-  * 6. Check if counter (cnt) is equal to length to assert finish,
-  *    otherwise go to step 2.
-  */
-module Compute #
-(
-  parameter MEM_LEN_BITS = 8,
-  parameter MEM_ADDR_BITS = 64,
-  parameter MEM_DATA_BITS = 64,
-  parameter HOST_DATA_BITS = 32
-)
-(
-  input                         clock,
-  input                         reset,
-
-  output                        mem_req_valid,
-  output                        mem_req_opcode,
-  output     [MEM_LEN_BITS-1:0] mem_req_len,
-  output    [MEM_ADDR_BITS-1:0] mem_req_addr,
-  output                        mem_wr_valid,
-  output    [MEM_DATA_BITS-1:0] mem_wr_bits,
-  input                         mem_rd_valid,
-  input     [MEM_DATA_BITS-1:0] mem_rd_bits,
-  output                        mem_rd_ready,
-
-  input                         launch,
-  output                        finish,
-
-  output                        event_counter_valid,
-  output   [HOST_DATA_BITS-1:0] event_counter_value,
-
-  input    [HOST_DATA_BITS-1:0] constant,
-  input    [HOST_DATA_BITS-1:0] length,
-  input     [MEM_ADDR_BITS-1:0] inp_baddr,
-  input     [MEM_ADDR_BITS-1:0] out_baddr
-);
-
-  typedef enum logic [2:0] {IDLE,
-                            READ_REQ,
-                            READ_DATA,
-                            WRITE_REQ,
-                            WRITE_DATA} state_t;
-
-  state_t state_n, state_r;
-
-  logic [31:0] cnt;
-  logic [MEM_DATA_BITS-1:0] data;
-  logic [MEM_ADDR_BITS-1:0] raddr;
-  logic [MEM_ADDR_BITS-1:0] waddr;
-
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      state_r <= IDLE;
-    end else begin
-      state_r <= state_n;
-    end
-  end
-
-  always_comb begin
-    state_n = IDLE;
-    case (state_r)
-      IDLE: begin
-        if (launch) begin
-          state_n = READ_REQ;
-        end
-      end
-
-      READ_REQ: begin
-        state_n = READ_DATA;
-      end
-
-      READ_DATA: begin
-        if (mem_rd_valid) begin
-          state_n = WRITE_REQ;
-        end else begin
-          state_n = READ_DATA;
-        end
-      end
-
-      WRITE_REQ: begin
-        state_n = WRITE_DATA;
-      end
-
-      WRITE_DATA: begin
-        if (cnt == (length - 1'b1)) begin
-          state_n = IDLE;
-        end else begin
-          state_n = READ_REQ;
-        end
-      end
-
-      default: begin
-      end
-    endcase
-  end
-
-  logic last;
-  assign last = (state_r == WRITE_DATA) & (cnt == (length - 1'b1));
-
-  // cycle counter
-  logic [HOST_DATA_BITS-1:0] cycle_counter;
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      cycle_counter <= '0;
-    end else begin
-      cycle_counter <= cycle_counter + 1'b1;
-    end
-  end
-
-  assign event_counter_valid = last;
-  assign event_counter_value = cycle_counter;
-
-  // calculate next address
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      raddr <= inp_baddr;
-      waddr <= out_baddr;
-    end else if (state_r == WRITE_DATA) begin
-      raddr <= raddr + 'd8;
-      waddr <= waddr + 'd8;
-    end
-  end
-
-  // create request
-  assign mem_req_valid = (state_r == READ_REQ) | (state_r == WRITE_REQ);
-  assign mem_req_opcode = state_r == WRITE_REQ;
-  assign mem_req_len = 'd0; // one-word-per-request
-  assign mem_req_addr = (state_r == READ_REQ)? raddr : waddr;
-
-  // read
-  always_ff @(posedge clock) begin
-    if ((state_r == READ_DATA) & mem_rd_valid) begin
-      data <= mem_rd_bits + {32'd0, constant};
-    end
-  end
-  assign mem_rd_ready = state_r == READ_DATA;
-
-  // write
-  assign mem_wr_valid = state_r == WRITE_DATA;
-  assign mem_wr_bits = data;
-
-  // count read/write
-  always_ff @(posedge clock) begin
-    if (reset | state_r == IDLE) begin
-      cnt <= 'd0;
-    end else if (state_r == WRITE_DATA) begin
-      cnt <= cnt + 1'b1;
-    end
-  end
-
-  // done when read/write are equal to length
-  assign finish = last;
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v
deleted file mode 100644
index 7174682..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/RegFile.v
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Register File.
-  *
-  * Six 32-bit register file.
-  *
-  * -------------------------------
-  *  Register description    | addr
-  * -------------------------|-----
-  *  Control status register | 0x00
-  *  Cycle counter           | 0x04
-  *  Constant value          | 0x08
-  *  Vector length           | 0x0c
-  *  Input pointer lsb       | 0x10
-  *  Input pointer msb       | 0x14
-  *  Output pointer lsb      | 0x18
-  *  Output pointer msb      | 0x1c
-  * -------------------------------
-
-  * ------------------------------
-  *  Control status register | bit
-  * ------------------------------
-  *  Launch                  | 0
-  *  Finish                  | 1
-  * ------------------------------
-  */
-module RegFile #
- (parameter MEM_ADDR_BITS = 64,
-  parameter HOST_ADDR_BITS = 8,
-  parameter HOST_DATA_BITS = 32
-)
-(
-  input                         clock,
-  input                         reset,
-
-  input                         host_req_valid,
-  input                         host_req_opcode,
-  input    [HOST_ADDR_BITS-1:0] host_req_addr,
-  input    [HOST_DATA_BITS-1:0] host_req_value,
-  output                        host_req_deq,
-  output                        host_resp_valid,
-  output   [HOST_DATA_BITS-1:0] host_resp_bits,
-
-  output                        launch,
-  input                         finish,
-
-  input                         event_counter_valid,
-  input    [HOST_DATA_BITS-1:0] event_counter_value,
-
-  output   [HOST_DATA_BITS-1:0] constant,
-  output   [HOST_DATA_BITS-1:0] length,
-  output    [MEM_ADDR_BITS-1:0] inp_baddr,
-  output    [MEM_ADDR_BITS-1:0] out_baddr
-);
-
-  localparam NUM_REG = 8;
-
-  typedef enum logic {IDLE, READ} state_t;
-  state_t state_n, state_r;
-
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      state_r <= IDLE;
-    end else begin
-      state_r <= state_n;
-    end
-  end
-
-  always_comb begin
-    state_n = IDLE;
-    case (state_r)
-      IDLE: begin
-        if (host_req_valid & ~host_req_opcode) begin
-          state_n = READ;
-        end
-      end
-
-      READ: begin
-        state_n = IDLE;
-      end
-    endcase
-  end
-
-  assign host_req_deq = (state_r == IDLE) ? host_req_valid : 1'b0;
-
-  logic [HOST_DATA_BITS-1:0] rf [NUM_REG-1:0];
-
-  genvar i;
-  for (i = 0; i < NUM_REG; i++) begin
-
-    logic wen = (state_r == IDLE)? host_req_valid & host_req_opcode & i*4 == host_req_addr : 1'b0;
-
-    if (i == 0) begin
-
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (finish) begin
-          rf[i] <= 'd2;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end else if (i == 1) begin
-
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (event_counter_valid) begin
-          rf[i] <= event_counter_value;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end else begin
-
-      always_ff @(posedge clock) begin
-        if (reset) begin
-          rf[i] <= 'd0;
-        end else if (wen) begin
-          rf[i] <= host_req_value;
-        end
-      end
-
-    end
-
-  end
-
-  logic [HOST_DATA_BITS-1:0] rdata;
-  always_ff @(posedge clock) begin
-    if (reset) begin
-      rdata <= 'd0;
-    end else if ((state_r == IDLE) & host_req_valid & ~host_req_opcode) begin
-      if (host_req_addr == 'h00) begin
-        rdata <= rf[0];
-      end else if (host_req_addr == 'h04) begin
-        rdata <= rf[1];
-      end else if (host_req_addr == 'h08) begin
-        rdata <= rf[2];
-      end else if (host_req_addr == 'h0c) begin
-        rdata <= rf[3];
-      end else if (host_req_addr == 'h10) begin
-        rdata <= rf[4];
-      end else if (host_req_addr == 'h14) begin
-        rdata <= rf[5];
-      end else if (host_req_addr == 'h18) begin
-        rdata <= rf[6];
-      end else if (host_req_addr == 'h1c) begin
-        rdata <= rf[7];
-      end else begin
-        rdata <= 'd0;
-      end
-    end
-  end
-
-  assign host_resp_valid = (state_r == READ);
-  assign host_resp_bits = rdata;
-
-  assign launch = rf[0][0];
-  assign constant = rf[2];
-  assign length = rf[3];
-  assign inp_baddr = {rf[5], rf[4]};
-  assign out_baddr = {rf[7], rf[6]};
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v b/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v
deleted file mode 100644
index cc1ec85..0000000
--- a/vta/vta-hw/apps/tsim_example/hardware/verilog/src/TestAccel.v
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/** Test accelerator.
-  *
-  * Instantiate host/memory DPI modules and connect them to the accelerator.
-  *
-  */
-module TestAccel
-(
-  input clock,
-  input reset,
-  input sim_clock,
-  output sim_wait
-);
-
-  localparam HOST_ADDR_BITS = 8;
-  localparam HOST_DATA_BITS = 32;
-
-  logic                      host_req_valid;
-  logic                      host_req_opcode;
-  logic [HOST_ADDR_BITS-1:0] host_req_addr;
-  logic [HOST_DATA_BITS-1:0] host_req_value;
-  logic                      host_req_deq;
-  logic                      host_resp_valid;
-  logic [HOST_DATA_BITS-1:0] host_resp_bits;
-
-  localparam MEM_LEN_BITS = 8;
-  localparam MEM_ADDR_BITS = 64;
-  localparam MEM_DATA_BITS = 64;
-
-  logic                     mem_req_valid;
-  logic                     mem_req_opcode;
-  logic  [MEM_LEN_BITS-1:0] mem_req_len;
-  logic [MEM_ADDR_BITS-1:0] mem_req_addr;
-  logic                     mem_wr_valid;
-  logic [MEM_DATA_BITS-1:0] mem_wr_bits;
-  logic                     mem_rd_valid;
-  logic [MEM_DATA_BITS-1:0] mem_rd_bits;
-  logic                     mem_rd_ready;
-
-  VTASimDPI sim
-  (
-    .clock          (sim_clock),
-    .reset          (reset),
-
-    .dpi_wait       (sim_wait)
-  );
-
-  VTAHostDPI host
-  (
-    .clock          (clock),
-    .reset          (reset),
-
-    .dpi_req_valid  (host_req_valid),
-    .dpi_req_opcode (host_req_opcode),
-    .dpi_req_addr   (host_req_addr),
-    .dpi_req_value  (host_req_value),
-    .dpi_req_deq    (host_req_deq),
-    .dpi_resp_valid (host_resp_valid),
-    .dpi_resp_bits  (host_resp_bits)
-  );
-
-  VTAMemDPI mem
-  (
-    .clock          (clock),
-    .reset          (reset),
-
-    .dpi_req_valid  (mem_req_valid),
-    .dpi_req_opcode (mem_req_opcode),
-    .dpi_req_len    (mem_req_len),
-    .dpi_req_addr   (mem_req_addr),
-    .dpi_wr_valid   (mem_wr_valid),
-    .dpi_wr_bits    (mem_wr_bits),
-    .dpi_rd_valid   (mem_rd_valid),
-    .dpi_rd_bits    (mem_rd_bits),
-    .dpi_rd_ready   (mem_rd_ready)
-  );
-
-  Accel #
-  (
-    .HOST_ADDR_BITS(HOST_ADDR_BITS),
-    .HOST_DATA_BITS(HOST_DATA_BITS),
-    .MEM_LEN_BITS(MEM_LEN_BITS),
-    .MEM_ADDR_BITS(MEM_ADDR_BITS),
-    .MEM_DATA_BITS(MEM_DATA_BITS)
-  )
-  accel
-  (
-    .clock           (clock),
-    .reset           (reset),
-
-    .host_req_valid  (host_req_valid),
-    .host_req_opcode (host_req_opcode),
-    .host_req_addr   (host_req_addr),
-    .host_req_value  (host_req_value),
-    .host_req_deq    (host_req_deq),
-    .host_resp_valid (host_resp_valid),
-    .host_resp_bits  (host_resp_bits),
-
-    .mem_req_valid   (mem_req_valid),
-    .mem_req_opcode  (mem_req_opcode),
-    .mem_req_len     (mem_req_len),
-    .mem_req_addr    (mem_req_addr),
-    .mem_wr_valid    (mem_wr_valid),
-    .mem_wr_bits     (mem_wr_bits),
-    .mem_rd_valid    (mem_rd_valid),
-    .mem_rd_bits     (mem_rd_bits),
-    .mem_rd_ready    (mem_rd_ready)
-  );
-
-endmodule
diff --git a/vta/vta-hw/apps/tsim_example/python/__init__.py b/vta/vta-hw/apps/tsim_example/python/__init__.py
deleted file mode 100644
index 4bc21e2..0000000
--- a/vta/vta-hw/apps/tsim_example/python/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from . import tsim
diff --git a/vta/vta-hw/apps/tsim_example/python/tsim.py b/vta/vta-hw/apps/tsim_example/python/tsim.py
deleted file mode 100644
index 85fd463..0000000
--- a/vta/vta-hw/apps/tsim_example/python/tsim.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import ctypes
-import os.path as osp
-from sys import platform
-
-def get_ext():
-    """Return shared library extension"""
-    return ".dylib" if platform == "darwin" else ".so"
-
-def load_dll(dll):
-    """Load shared library
-
-    Parameters
-    ------------
-    dll : str
-        Path for shared library
-
-    Returns
-    ------------
-    The shared library
-    """
-    try:
-        return [ctypes.CDLL(dll, ctypes.RTLD_GLOBAL)]
-    except OSError:
-        return []
-
-def load_sw():
-    """Load all software shared libraries"""
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    sw_libname = "libsw" + get_ext()
-    sw_lib = osp.join(cur_path, "..", "build", sw_libname)
-    load_dll(sw_lib)
-
-def init(hw_backend):
-    """Init hardware and software shared library for accelerator
-
-    Parameters
-    ------------
-    hw_backend : str
-        Hardware backend can be verilog or chisel
-
-    """
-    cur_path = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-    hw_libname = "libhw" + get_ext()
-    if hw_backend in ("verilog", "chisel"):
-        hw_lib = osp.join(cur_path, "..", "hardware", hw_backend, "build", hw_libname)
-    load_sw()
-    m = tvm.runtime.load_module(hw_lib, "vta-tsim")
-    f = tvm.get_global_func("tvm.vta.tsim.init")
-    f(m)
-
-def load_module():
-    """Return driver function"""
-    load_sw()
-    return tvm.get_global_func("tvm.vta.driver")
diff --git a/vta/vta-hw/apps/tsim_example/src/driver.cc b/vta/vta-hw/apps/tsim_example/src/driver.cc
deleted file mode 100644
index 9560696..0000000
--- a/vta/vta-hw/apps/tsim_example/src/driver.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/registry.h>
-#include <vta/dpi/module.h>
-
-#include "vmem/virtual_memory.h"
-
-namespace vta {
-namespace driver {
-
-using vta::dpi::DPIModuleNode;
-using tvm::runtime::Module;
-
-class DPILoader {
- public:
-  ~DPILoader() {
-    dpi_->SimResume();
-    dpi_->SimFinish();
-  }
-
-  void Init(Module module) {
-    mod_ = module;
-    dpi_ = this->Get();
-    dpi_->SimLaunch();
-    dpi_->SimWait();
-  }
-
-  DPIModuleNode* Get() {
-    return static_cast<DPIModuleNode*>(mod_.operator->());
-  }
-
-  static DPILoader* Global() {
-    static DPILoader inst;
-    return &inst;
-  }
-
-  // TVM module
-  Module mod_;
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-};
-
-class Device {
- public:
-  Device() {
-    loader_ = DPILoader::Global();
-  }
-
-  uint32_t Run(uint32_t c, DLTensor* a, DLTensor* b) {
-    uint32_t cycles;
-    uint32_t len = a->shape[0];
-    size_t size = (a->dtype.bits >> 3) * len;
-    a_ = this->MemAlloc(size);
-    b_ = this->MemAlloc(size);
-    this->MemCopyFromHost(a_, a->data, size);
-    this->Init();
-    this->Launch(c, len);
-    cycles = this->WaitForCompletion();
-    this->MemCopyToHost(b->data, b_, size);
-    this->MemFree(a_);
-    this->MemFree(b_);
-    return cycles;
-  }
-
- private:
-  void Init() {
-    dpi_ = loader_->Get();
-    dpi_->SimResume();
-  }
-
-  void* MemAlloc(size_t size) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->Alloc(size);
-    return reinterpret_cast<void*>(vta::vmem::VirtualMemoryManager::Global()->GetPhyAddr(addr));
-  }
-
-  void MemFree(void* buf) {
-    void * addr = vta::vmem::VirtualMemoryManager::Global()->GetAddr(reinterpret_cast<uint64_t>(buf));
-    vta::vmem::VirtualMemoryManager::Global()->Free(addr);
-  }
-
-  vta_phy_addr_t MemGetPhyAddr(void* buf) {
-    return reinterpret_cast<uint64_t>(reinterpret_cast<uint64_t*>(buf));
-  }
-
-  void MemCopyFromHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyFromHost(dst, src, size);
-  }
-
-  void MemCopyToHost(void* dst, const void* src, size_t size) {
-    vta::vmem::VirtualMemoryManager::Global()->MemCopyToHost(dst, src, size);
-  }
-
-  void Launch(uint32_t c, uint32_t len) {
-    dpi_->WriteReg(0x08, c);
-    dpi_->WriteReg(0x0c, len);
-    dpi_->WriteReg(0x10, this->MemGetPhyAddr(a_));
-    dpi_->WriteReg(0x14, 0);
-    dpi_->WriteReg(0x18, this->MemGetPhyAddr(b_));
-    dpi_->WriteReg(0x1c, 0);
-    dpi_->WriteReg(0x00, 0x1); // launch
-  }
-
-  uint32_t WaitForCompletion() {
-    uint32_t i, val;
-    for (i = 0; i < wait_cycles_; i++) {
-      val = dpi_->ReadReg(0x00);
-      if (val == 2) break; // finish
-    }
-    val = dpi_->ReadReg(0x04);
-    dpi_->SimWait();
-    return val;
-  }
-
-  // wait cycles
-  uint32_t wait_cycles_{100000000};
-  // DPI loader
-  DPILoader* loader_{nullptr};
-  // DPI Module
-  DPIModuleNode* dpi_{nullptr};
-  // input vm ptr
-  void* a_{nullptr};
-  // output vm ptr
-  void* b_{nullptr};
-};
-
-using tvm::runtime::TVMRetValue;
-using tvm::runtime::TVMArgs;
-
-TVM_REGISTER_GLOBAL("tvm.vta.tsim.init")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Module m = args[0];
-    DPILoader::Global()->Init(m);
-  });
-
-TVM_REGISTER_GLOBAL("tvm.vta.driver")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    Device dev_;
-    DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    uint32_t c = static_cast<int>(args[2]);
-    uint32_t cycles = dev_.Run(c, A, B);
-    *rv = static_cast<int>(cycles);
-  });
-
-}  // namespace driver
-}  // namespace vta
diff --git a/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py b/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py
deleted file mode 100644
index 370ac40..0000000
--- a/vta/vta-hw/apps/tsim_example/tests/python/chisel_accel.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-
-def test_accel():
-    rmax = 64
-    dtype = "uint64"
-    n = np.random.randint(1, rmax)
-    c = np.random.randint(0, rmax)
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
-    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
-    f = tsim.load_module()
-    cycles = f(a, b, c)
-    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg)
-    print("[PASS] " + msg)
-
-if __name__ == "__main__":
-    tsim.init("chisel")
-    for i in range(10):
-        test_accel()
diff --git a/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py b/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py
deleted file mode 100644
index 3489ff2..0000000
--- a/vta/vta-hw/apps/tsim_example/tests/python/verilog_accel.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-from tvm import te
-import numpy as np
-import tsim
-
-def test_accel():
-    rmax = 64
-    dtype = "uint64"
-    n = np.random.randint(1, rmax)
-    c = np.random.randint(0, rmax)
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(np.random.randint(rmax, size=n).astype(dtype), ctx)
-    b = tvm.nd.array(np.zeros(n).astype(dtype), ctx)
-    f = tsim.load_module()
-    cycles = f(a, b, c)
-    msg = "cycles:{0:4} n:{1:2} c:{2:2}".format(cycles, n, c)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + c, err_msg = "[FAIL] " + msg)
-    print("[PASS] " + msg)
-
-if __name__ == "__main__":
-    tsim.init("verilog")
-    for i in range(10):
-        test_accel()
diff --git a/vta/vta-hw/config/README.md b/vta/vta-hw/config/README.md
deleted file mode 100644
index b675ef2..0000000
--- a/vta/vta-hw/config/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# VTA Configuration
-
-Each VTA runtime/hardware configuration is specified by vta_config.json file.
-You can copy the vta_config.json to tvm project root and modify the configuration
-before you type make.
-
-The config is going to affect the behavior of python package as well as
-the hardware runtime build.
diff --git a/vta/vta-hw/config/de10nano_sample.json b/vta/vta-hw/config/de10nano_sample.json
deleted file mode 100644
index e4148c3..0000000
--- a/vta/vta-hw/config/de10nano_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "de10nano",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/fsim_sample.json b/vta/vta-hw/config/fsim_sample.json
deleted file mode 100644
index 0591bb4..0000000
--- a/vta/vta-hw/config/fsim_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "sim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/pkg_config.py b/vta/vta-hw/config/pkg_config.py
deleted file mode 100644
index 9c57706..0000000
--- a/vta/vta-hw/config/pkg_config.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""VTA Package configuration module
-
-This module is dependency free and can be used to configure package.
-"""
-from __future__ import absolute_import as _abs
-
-import json
-import glob
-import os
-
-
-def get_vta_hw_path():
-    """Get the VTA HW path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    vta_hw_default = os.path.abspath(os.path.join(curr_path, ".."))
-    VTA_HW_PATH = os.getenv('VTA_HW_PATH', vta_hw_default)
-    return VTA_HW_PATH
-
-def get_tvm_path():
-    """Get the TVM path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    tvm_default = os.path.abspath(os.path.join(curr_path, "../../.."))
-    TVM_PATH = os.getenv('TVM_PATH', tvm_default)
-    return TVM_PATH
-
-class PkgConfig(object):
-    """Simple package config tool for VTA.
-
-    This is used to provide runtime specific configurations.
-
-    Parameters
-    ----------
-    cfg : dict
-        The config dictionary
-    """
-    cfg_keys = [
-        "TARGET",
-        "LOG_INP_WIDTH",
-        "LOG_WGT_WIDTH",
-        "LOG_ACC_WIDTH",
-        "LOG_BATCH",
-        "LOG_BLOCK",
-        "LOG_UOP_BUFF_SIZE",
-        "LOG_INP_BUFF_SIZE",
-        "LOG_WGT_BUFF_SIZE",
-        "LOG_ACC_BUFF_SIZE",
-    ]
-
-    def __init__(self, cfg):
-
-        # Derived parameters
-        cfg["LOG_BLOCK_IN"] = cfg["LOG_BLOCK"]
-        cfg["LOG_BLOCK_OUT"] = cfg["LOG_BLOCK"]
-        cfg["LOG_OUT_WIDTH"] = cfg["LOG_INP_WIDTH"]
-        cfg["LOG_OUT_BUFF_SIZE"] = (
-            cfg["LOG_ACC_BUFF_SIZE"] +
-            cfg["LOG_OUT_WIDTH"] -
-            cfg["LOG_ACC_WIDTH"])
-
-        # Update cfg now that we've extended it
-        self.__dict__.update(cfg)
-
-        # VTA_HW path and TVM_PATH
-        vta_hw_path = get_vta_hw_path()
-        tvm_path = get_tvm_path()
-
-        # Include path
-        self.include_path = [
-            "-I%s/include" % tvm_path,
-            "-I%s/include" % vta_hw_path,
-            "-I%s/3rdparty/dlpack/include" % tvm_path,
-            "-I%s/3rdparty/dmlc-core/include" % tvm_path
-        ]
-
-        # List of source files that can be used to build standalone library.
-        self.lib_source = []
-        self.lib_source += glob.glob("%s/src/*.cc" % vta_hw_path)
-        if self.TARGET in ["pynq", "ultra96"]:
-            # add pynq drivers for any board that uses pynq driver stack (see pynq.io)
-            self.lib_source += glob.glob("%s/src/pynq/*.cc" % vta_hw_path)
-        elif self.TARGET in ["de10nano"]:
-            self.lib_source += glob.glob("%s/src/de10nano/*.cc" % vta_hw_path)
-            self.include_path += [
-                "-I%s/src/de10nano" % vta_hw_path,
-                "-I%s/3rdparty" % tvm_path
-            ]
-
-        # Linker flags
-        if self.TARGET in ["pynq", "ultra96"]:
-            self.ldflags = [
-                "-L/usr/lib",
-                "-l:libcma.so"]
-        else:
-            self.ldflags = []
-
-        # Derive bitstream config string.
-        self.bitstream = "{}x{}_i{}w{}a{}_{}_{}_{}_{}".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK"]),
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            (1 << cfg["LOG_ACC_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"])
-
-        # Derive FPGA parameters from target
-        #   - device:           part number
-        #   - family:           fpga family
-        #   - freq:             PLL frequency
-        #   - per:              clock period to achieve in HLS
-        #                       (how aggressively design is pipelined)
-        #   - axi_bus_width:    axi bus width used for DMA transactions
-        #                       (property of FPGA memory interface)
-        #   - axi_cache_bits:   ARCACHE/AWCACHE signals for the AXI bus
-        #                       (e.g. 1111 is write-back read and write allocate)
-        #   - axi_prot_bits:    ARPROT/AWPROT signals for the AXI bus
-        if self.TARGET == "de10nano":
-            self.fpga_device = "5CSEBA6U23I7"
-            self.fpga_family = "Cyclone\\ V"
-            # TODO: The following parameters have not been propagated into
-            # current Chisel-based implement of VTA hardware for DE10-Nano.
-            # A future change should be made to propagate these parameters,
-            # in order to avoid duplicated definition.
-            self.fpga_freq = 100
-            self.fpga_per = 2
-            self.fpga_log_axi_bus_width = 6
-            self.axi_prot_bits = '100'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0xFF220000"
-            self.load_base_addr = "0xFF221000"
-            self.compute_base_addr = "0xFF222000"
-            self.store_base_addr = "0xFF223000"
-        elif self.TARGET == "ultra96":
-            self.fpga_device = "xczu3eg-sbva484-1-e"
-            self.fpga_family = "zynq-ultrascale+"
-            self.fpga_freq = 333
-            self.fpga_per = 2
-            self.fpga_log_axi_bus_width = 7
-            self.axi_prot_bits = '010'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0xA0000000"
-            self.load_base_addr = "0xA0001000"
-            self.compute_base_addr = "0xA0002000"
-            self.store_base_addr = "0xA0003000"
-        else:
-            # By default, we use the pynq parameters
-            self.fpga_device = "xc7z020clg484-1"
-            self.fpga_family = "zynq-7000"
-            self.fpga_freq = 100
-            self.fpga_per = 7
-            self.fpga_log_axi_bus_width = 6
-            self.axi_prot_bits = '000'
-            # IP register address map
-            self.ip_reg_map_range = "0x1000"
-            self.fetch_base_addr = "0x43C00000"
-            self.load_base_addr = "0x43C01000"
-            self.compute_base_addr = "0x43C02000"
-            self.store_base_addr = "0x43C03000"
-        # Set coherence settings
-        coherent = True
-        if coherent:
-            self.axi_cache_bits = '1111'
-            self.coherent = True
-
-        # Define IP memory mapped registers offsets.
-        # In HLS 0x00-0x0C is reserved for block-level I/O protocol.
-        # Make sure to leave 8B between register offsets to maintain
-        # compatibility with 64bit systems.
-        self.fetch_insn_count_offset = 0x10
-        self.fetch_insn_addr_offset = self.fetch_insn_count_offset + 0x08
-        self.load_inp_addr_offset = 0x10
-        self.load_wgt_addr_offset = self.load_inp_addr_offset + 0x08
-        self.compute_done_wr_offet = 0x10
-        self.compute_done_rd_offet = self.compute_done_wr_offet + 0x08
-        self.compute_uop_addr_offset = self.compute_done_rd_offet + 0x08
-        self.compute_bias_addr_offset = self.compute_uop_addr_offset + 0x08
-        self.store_out_addr_offset = 0x10
-
-        # Derive SRAM parameters
-        # The goal here is to determine how many memory banks are needed,
-        # how deep and wide each bank needs to be. This is derived from
-        # the size of each memory element (result of data width, and tensor shape),
-        # and also how wide a memory can be as permitted by the FPGA tools.
-        #
-        # The mem axi ratio is a parameter used by HLS to resize memories
-        # so memory read/write ports are the same size as the design axi bus width.
-        #
-        # Max bus width allowed (property of FPGA vendor toolchain)
-        max_bus_width = 1024
-        # Bus width of a memory interface
-        mem_bus_width = 1 << self.fpga_log_axi_bus_width
-        # Input memory
-        inp_mem_bus_width = 1 << (cfg["LOG_INP_WIDTH"] + \
-                                  cfg["LOG_BATCH"] + \
-                                  cfg["LOG_BLOCK_IN"])
-        self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"]  # bytes
-        self.inp_mem_banks = (inp_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.inp_mem_width = min(inp_mem_bus_width, max_bus_width)
-        self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width
-        self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width
-        # Weight memory
-        wgt_mem_bus_width = 1 << (cfg["LOG_WGT_WIDTH"] + \
-                                  cfg["LOG_BLOCK_IN"] + \
-                                  cfg["LOG_BLOCK_OUT"])
-        self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"]  # bytes
-        self.wgt_mem_banks = (wgt_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width)
-        self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width
-        self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width
-        # Output memory
-        out_mem_bus_width = 1 << (cfg["LOG_OUT_WIDTH"] + \
-                                  cfg["LOG_BATCH"] + \
-                                  cfg["LOG_BLOCK_OUT"])
-        self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"]  # bytes
-        self.out_mem_banks = (out_mem_bus_width + \
-                              max_bus_width - 1) // \
-            max_bus_width
-        self.out_mem_width = min(out_mem_bus_width, max_bus_width)
-        self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width
-        self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width
-
-        # Macro defs
-        self.macro_defs = []
-        self.cfg_dict = {}
-        for key in cfg:
-            self.macro_defs.append("-DVTA_%s=%s" % (key, str(cfg[key])))
-            self.cfg_dict[key] = cfg[key]
-        self.macro_defs.append("-DVTA_LOG_BUS_WIDTH=%s" % (self.fpga_log_axi_bus_width))
-        # Macros used by the VTA driver
-        self.macro_defs.append("-DVTA_IP_REG_MAP_RANGE=%s" % (self.ip_reg_map_range))
-        self.macro_defs.append("-DVTA_FETCH_ADDR=%s" % (self.fetch_base_addr))
-        self.macro_defs.append("-DVTA_LOAD_ADDR=%s" % (self.load_base_addr))
-        self.macro_defs.append("-DVTA_COMPUTE_ADDR=%s" % (self.compute_base_addr))
-        self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr))
-        # IP register offsets
-        self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \
-                               (self.fetch_insn_count_offset))
-        self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \
-                               (self.fetch_insn_addr_offset))
-        self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \
-                               (self.load_inp_addr_offset))
-        self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \
-                               (self.load_wgt_addr_offset))
-        self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \
-                               (self.compute_done_wr_offet))
-        self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \
-                               (self.compute_done_rd_offet))
-        self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \
-                               (self.compute_uop_addr_offset))
-        self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \
-                               (self.compute_bias_addr_offset))
-        self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \
-                               (self.store_out_addr_offset))
-        # Coherency
-        if coherent:
-            self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true")
-        else:
-            self.macro_defs.append("-DVTA_COHERENT_ACCESSES=false")
-
-    @property
-    def cflags(self):
-        return self.include_path + self.macro_defs
-
-    @property
-    def cfg_json(self):
-        return json.dumps(self.cfg_dict, indent=2)
-
-    def same_config(self, cfg):
-        """Compare if cfg is same as current config.
-
-        Parameters
-        ----------
-        cfg : the configuration
-            The configuration
-
-        Returns
-        -------
-        equal : bool
-            Whether the configuration is the same.
-        """
-        for k, v in self.cfg_dict.items():
-            if k not in cfg:
-                return False
-            if cfg[k] != v:
-                return False
-        return True
diff --git a/vta/vta-hw/config/pynq_sample.json b/vta/vta-hw/config/pynq_sample.json
deleted file mode 100644
index 7a26641..0000000
--- a/vta/vta-hw/config/pynq_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "pynq",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/tsim_sample.json b/vta/vta-hw/config/tsim_sample.json
deleted file mode 100644
index 71f77c0..0000000
--- a/vta/vta-hw/config/tsim_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "tsim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/ultra96_sample.json b/vta/vta-hw/config/ultra96_sample.json
deleted file mode 100644
index 35b5a7e..0000000
--- a/vta/vta-hw/config/ultra96_sample.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "ultra96",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/vta_config.json b/vta/vta-hw/config/vta_config.json
deleted file mode 100644
index 0591bb4..0000000
--- a/vta/vta-hw/config/vta_config.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "TARGET" : "sim",
-  "HW_VER" : "0.0.1",
-  "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
-  "LOG_ACC_WIDTH" : 5,
-  "LOG_BATCH" : 0,
-  "LOG_BLOCK" : 4,
-  "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
-  "LOG_ACC_BUFF_SIZE" : 17
-}
diff --git a/vta/vta-hw/config/vta_config.py b/vta/vta-hw/config/vta_config.py
deleted file mode 100644
index 9bb6d7b..0000000
--- a/vta/vta-hw/config/vta_config.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""VTA config tool"""
-import os
-import sys
-import json
-import argparse
-
-
-def pkg_config(cfg):
-    """Returns PkgConfig pkg config object."""
-    pkg_config_py = os.path.join(
-            os.path.dirname(os.path.abspath(os.path.expanduser(__file__))),
-            "pkg_config.py"
-    )
-    libpkg = {"__file__": pkg_config_py}
-    exec(compile(open(pkg_config_py, "rb").read(), pkg_config_py, "exec"), libpkg, libpkg)
-    PkgConfig = libpkg["PkgConfig"]
-    return PkgConfig(cfg)
-
-def main():
-    """Main function"""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--use-cfg", type=str, default="",
-                        help="path to the config json")
-    parser.add_argument("--cflags", action="store_true",
-                        help="print the cflags")
-    parser.add_argument("--defs", action="store_true",
-                        help="print the macro defs")
-    parser.add_argument("--sources", action="store_true",
-                        help="print the source file paths")
-    parser.add_argument("--update", action="store_true",
-                        help="Print out the json option.")
-    parser.add_argument("--ldflags", action="store_true",
-                        help="print the ldflags")
-    parser.add_argument("--cfg-json", action="store_true",
-                        help="print all the config json")
-    parser.add_argument("--save-cfg-json", type=str, default="",
-                        help="save config json to file")
-    parser.add_argument("--target", action="store_true",
-                        help="print the target")
-    parser.add_argument("--cfg-str", action="store_true",
-                        help="print the configuration string")
-    parser.add_argument("--get-inp-mem-banks", action="store_true",
-                        help="returns number of input memory banks")
-    parser.add_argument("--get-inp-mem-width", action="store_true",
-                        help="returns input memory read/write port width")
-    parser.add_argument("--get-inp-mem-depth", action="store_true",
-                        help="returns input memory depth")
-    parser.add_argument("--get-inp-mem-axi-ratio", action="store_true",
-                        help="returns ratio between input element width and axi width")
-    parser.add_argument("--get-wgt-mem-banks", action="store_true",
-                        help="returns number of weight memory banks")
-    parser.add_argument("--get-wgt-mem-width", action="store_true",
-                        help="returns weight memory read/write port width")
-    parser.add_argument("--get-wgt-mem-depth", action="store_true",
-                        help="returns weight memory depth")
-    parser.add_argument("--get-wgt-mem-axi-ratio", action="store_true",
-                        help="returns ratio between weight element width and axi width")
-    parser.add_argument("--get-out-mem-banks", action="store_true",
-                        help="returns number of output memory banks")
-    parser.add_argument("--get-out-mem-width", action="store_true",
-                        help="returns output memory read/write port width")
-    parser.add_argument("--get-out-mem-depth", action="store_true",
-                        help="returns output memory depth")
-    parser.add_argument("--get-out-mem-axi-ratio", action="store_true",
-                        help="returns ratio between output element width and axi width")
-    parser.add_argument("--get-axi-cache-bits", action="store_true",
-                        help="returns AXI system ARCACHE/AWCACHE hardcoded bit value")
-    parser.add_argument("--get-axi-prot-bits", action="store_true",
-                        help="returns AXI system ARPROT/AWPROT hardcoded bit value")
-    parser.add_argument("--get-ip-reg-map-range", action="store_true",
-                        help="returns ip register map address range")
-    parser.add_argument("--get-fetch-base-addr", action="store_true",
-                        help="returns fetch module base address")
-    parser.add_argument("--get-load-base-addr", action="store_true",
-                        help="returns load module base address")
-    parser.add_argument("--get-compute-base-addr", action="store_true",
-                        help="returns compute module base address")
-    parser.add_argument("--get-store-base-addr", action="store_true",
-                        help="returns store module base address")
-    parser.add_argument("--get-fpga-dev", action="store_true",
-                        help="returns FPGA device target")
-    parser.add_argument("--get-fpga-family", action="store_true",
-                        help="returns FPGA device family")
-    parser.add_argument("--get-fpga-freq", action="store_true",
-                        help="returns FPGA frequency")
-    parser.add_argument("--get-fpga-per", action="store_true",
-                        help="returns HLS target clock period")
-    args = parser.parse_args()
-
-    if len(sys.argv) == 1:
-        parser.print_help()
-        return
-
-    # Path to vta config
-    curr_path = os.path.dirname(
-        os.path.abspath(os.path.expanduser(__file__)))
-
-    path_list = [
-        "vta_config.json", os.path.join(curr_path, "vta_config.json")
-    ]
-
-    if args.use_cfg:
-        path_list = [args.use_cfg]
-
-    ok_path_list = [p for p in path_list if os.path.exists(p)]
-    if not ok_path_list:
-        raise RuntimeError("Cannot find config in %s" % str(path_list))
-
-    cfg = json.load(open(ok_path_list[0]))
-    pkg = pkg_config(cfg)
-
-    if args.target:
-        print(pkg.TARGET)
-
-    if args.defs:
-        print(" ".join(pkg.macro_defs))
-
-    if args.sources:
-        print(" ".join(pkg.lib_source))
-
-    if args.cflags:
-        cflags_str = " ".join(pkg.cflags)
-        if pkg.TARGET == "pynq":
-            cflags_str += " -DVTA_TARGET_PYNQ"
-        elif pkg.TARGET == "de10nano":
-            cflags_str += " -DVTA_TARGET_DE10_NANO"
-        elif pkg.TARGET == "ultra96":
-            cflags_str += " -DVTA_TARGET_ULTRA96"
-        print(cflags_str)
-
-    if args.ldflags:
-        print(" ".join(pkg.ldflags))
-
-    if args.cfg_json:
-        print(pkg.cfg_json)
-
-    if args.save_cfg_json:
-        with open(args.save_cfg_json, "w") as fo:
-            fo.write(pkg.cfg_json)
-
-    if args.cfg_str:
-        print(pkg.TARGET + "_" + pkg.bitstream)
-
-    if args.get_inp_mem_banks:
-        print(pkg.inp_mem_banks)
-
-    if args.get_inp_mem_width:
-        print(pkg.inp_mem_width)
-
-    if args.get_inp_mem_depth:
-        print(pkg.inp_mem_depth)
-
-    if args.get_inp_mem_axi_ratio:
-        print(pkg.inp_mem_axi_ratio)
-
-    if args.get_wgt_mem_banks:
-        print(pkg.wgt_mem_banks)
-
-    if args.get_wgt_mem_width:
-        print(pkg.wgt_mem_width)
-
-    if args.get_wgt_mem_depth:
-        print(pkg.wgt_mem_depth)
-
-    if args.get_wgt_mem_axi_ratio:
-        print(pkg.wgt_mem_axi_ratio)
-
-    if args.get_out_mem_banks:
-        print(pkg.out_mem_banks)
-
-    if args.get_out_mem_width:
-        print(pkg.out_mem_width)
-
-    if args.get_out_mem_depth:
-        print(pkg.out_mem_depth)
-
-    if args.get_out_mem_axi_ratio:
-        print(pkg.out_mem_axi_ratio)
-
-    if args.get_axi_cache_bits:
-        print(pkg.axi_cache_bits)
-
-    if args.get_axi_prot_bits:
-        print(pkg.axi_prot_bits)
-
-    if args.get_ip_reg_map_range:
-        print(pkg.ip_reg_map_range)
-
-    if args.get_fetch_base_addr:
-        print(pkg.fetch_base_addr)
-
-    if args.get_load_base_addr:
-        print(pkg.load_base_addr)
-
-    if args.get_compute_base_addr:
-        print(pkg.compute_base_addr)
-
-    if args.get_store_base_addr:
-        print(pkg.store_base_addr)
-
-    if args.get_fpga_dev:
-        print(pkg.fpga_device)
-
-    if args.get_fpga_family:
-        print(pkg.fpga_family)
-
-    if args.get_fpga_freq:
-        print(pkg.fpga_freq)
-
-    if args.get_fpga_per:
-        print(pkg.fpga_per)
-
-if __name__ == "__main__":
-    main()
diff --git a/vta/vta-hw/hardware/chisel/.gitignore b/vta/vta-hw/hardware/chisel/.gitignore
deleted file mode 100644
index f65a6ba..0000000
--- a/vta/vta-hw/hardware/chisel/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-test_run_dir
diff --git a/vta/vta-hw/hardware/chisel/Makefile b/vta/vta-hw/hardware/chisel/Makefile
deleted file mode 100644
index 049b4d4..0000000
--- a/vta/vta-hw/hardware/chisel/Makefile
+++ /dev/null
@@ -1,205 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ifeq (, $(shell which verilator))
- $(error "No Verilator in $(PATH), consider doing apt-get install verilator")
-endif
-
-# Set VERILATOR_INC_DIR if Verilator is installed in a non-default location
-ifeq (, $(VERILATOR_INC_DIR))
-	ifeq (, $(wildcard /usr/local/share/verilator/include/*))
-		ifeq (, $(wildcard /usr/share/verilator/include/*))
-			$(error "Verilator include directory is not set properly")
-		else
-			VERILATOR_INC_DIR := /usr/share/verilator/include
-		endif
-	else
-			VERILATOR_INC_DIR := /usr/local/share/verilator/include
-	endif
-endif
-
-CONFIG = DefaultDe10Config
-TOP = VTA
-TOP_TEST = Test
-BUILD_NAME = build
-# Set USE_TRACE = 1 to generate a trace during simulation.
-USE_TRACE = 0
-# With USE_TRACE = 1, default trace format is VCD.
-# Set USE_TRACE_FST = 1 to use the FST format.
-# Note that although FST is around two orders of magnitude smaller than VCD
-# it is also currently much slower to produce (verilator limitation). But if
-# you are low on disk space it may be your only option.
-USE_TRACE_FST = 0
-# With USE_TRACE = 1, USE_TRACE_DETAILED = 1 will generate traces that also
-# include non-interface internal signal names starting with an underscore.
-# This will significantly increase the trace size and should only be used
-# as needed when debugging difficult problems.
-USE_TRACE_DETAILED = 0
-USE_THREADS = 0
-VTA_LIBNAME = libvta_hw
-UNITTEST_NAME = all
-CXX = g++
-# A debug build with DEBUG = 1 is useful to trace the simulation with a
-# debugger.
-DEBUG = 0
-# With DEBUG = 1, SANITIZE = 1 turns on address sanitizing to verify that
-# the verilator build is sane. To be used if you know what you are doing.
-SANITIZE = 0
-
-CXX_MAJOR := $(shell $(CXX) -dumpversion | sed 's/\..*//')
-CXX_HAS_ALIGN_NEW := $(shell [ $(CXX_MAJOR) -ge 7 ] && echo true)
-
-config_test = $(TOP_TEST)$(CONFIG)
-
-
-ifndef TVM_PATH
-   TVM_PATH := $(abspath ../../../../)
-endif
-
-ifndef VTA_HW_PATH
-   VTA_HW_PATH := $(abspath ../../)
-endif
-
-verilator_build_dir = $(VTA_HW_PATH)/$(BUILD_NAME)/verilator
-chisel_build_dir = $(VTA_HW_PATH)/$(BUILD_NAME)/chisel
-
-verilator_opt = --cc
-verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
-verilator_opt += +define+RANDOMIZE_REG_INIT
-verilator_opt += +define+RANDOMIZE_MEM_INIT
-verilator_opt += --x-assign unique
-verilator_opt += --output-split 20000
-verilator_opt += --output-split-cfuncs 20000
-verilator_opt += --top-module ${TOP_TEST}
-verilator_opt += -Mdir ${verilator_build_dir}
-verilator_opt += -I$(chisel_build_dir)
-
-ifeq ($(DEBUG), 0)
-	cxx_flags = -O2 -Wall -fvisibility=hidden
-else
-	cxx_flags = -O0 -g -Wall
-endif
-
-cxx_flags += -std=c++11 -Wno-maybe-uninitialized
-ifeq ($(CXX_HAS_ALIGN_NEW),true)
-	cxx_flags += -faligned-new
-endif
-cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
-cxx_flags += -DVL_PRINTF=printf
-cxx_flags += -DVL_USER_FINISH
-cxx_flags += -DVM_COVERAGE=0
-cxx_flags += -DVM_SC=0
-cxx_flags += -Wno-sign-compare
-cxx_flags += -include V$(TOP_TEST).h
-cxx_flags += -I$(verilator_build_dir)
-cxx_flags += -I$(VERILATOR_INC_DIR)
-cxx_flags += -I$(VERILATOR_INC_DIR)/vltstd
-cxx_flags += -I$(VTA_HW_PATH)/include
-cxx_flags += -I$(TVM_PATH)/include
-cxx_flags += -I$(TVM_PATH)/3rdparty/dlpack/include
-
-ld_flags = -fPIC -shared
-
-ifeq ($(SANITIZE), 1)
-	ifeq ($(DEBUG), 1)
-		cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-		ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
-	endif
-endif
-
-cxx_objs = $(verilator_build_dir)/verilated.o $(verilator_build_dir)/verilated_dpi.o $(verilator_build_dir)/tsim_device.o
-
-ifneq ($(USE_TRACE), 0)
-	cxx_flags += -DVM_TRACE=1
-	ifeq ($(USE_TRACE_FST), 1)
-		cxx_flags += -DVM_TRACE_FST
-		verilator_opt += --trace-fst
-	else
-		verilator_opt += --trace
-	endif
-	ifeq ($(USE_TRACE_DETAILED), 1)
-		verilator_opt += --trace-underscore --trace-structs
-	endif
-	ifeq ($(USE_TRACE_FST), 1)
-		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
-		cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
-	else
-		cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
-		cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
-	endif
-else
-	cxx_flags += -DVM_TRACE=0
-endif
-
-ifneq ($(USE_THREADS), 0)
-	verilator_opt += --threads $(USE_THREADS)
-	cxx_flags += -DVL_THREADED
-	cxx_objs += $(verilator_build_dir)/verilated_threads.o
-endif
-
-VPATH = $(VERILATOR_INC_DIR):$(verilator_build_dir):$(VTA_HW_PATH)/hardware/dpi
-
-# The following is to be consistent with cmake
-ifeq ($(shell uname), Darwin)
-	lib_path = $(VTA_HW_PATH)/$(BUILD_NAME)/$(VTA_LIBNAME).dylib
-	cxx_flags += -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-else
-	lib_path = $(VTA_HW_PATH)/$(BUILD_NAME)/$(VTA_LIBNAME).so
-endif
-
-default: lint lib
-
-lint:
-	sbt scalastyle
-
-lib: $(lib_path)
-
-$(verilator_build_dir)/%.o: %.cpp
-	$(CXX) -fPIC $(cxx_flags) -c $^ -o $@
-
-$(verilator_build_dir)/tsim_device.o: tsim_device.cc
-	$(CXX) -fPIC $(cxx_flags) -c $^ -o $@
-
-$(lib_path): $(verilator_build_dir)/V$(TOP_TEST).cpp $(cxx_objs)
-	for f in $(shell find $(verilator_build_dir)/*.cpp); do \
-		$(CXX) -fPIC $(cxx_flags) -c $${f} -o $${f}.o ; \
-	done
-	$(CXX) $(ld_flags) $(cxx_flags) $(cxx_objs) $(patsubst %.cpp,%.cpp.o,$(shell find $(verilator_build_dir)/*.cpp)) -o $@
-
-verilator: $(verilator_build_dir)/V$(TOP_TEST).cpp
-$(verilator_build_dir)/V$(TOP_TEST).cpp: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
-	verilator $(verilator_opt) $<
-
-verilog: $(chisel_build_dir)/$(TOP).$(CONFIG).v
-$(chisel_build_dir)/$(TOP).$(CONFIG).v:
-	sbt 'runMain vta.$(CONFIG) --target-dir $(chisel_build_dir) --top-name $(TOP).$(CONFIG)'
-
-verilog_test: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
-$(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v:
-	sbt 'runMain vta.$(config_test) --target-dir $(chisel_build_dir) --top-name $(TOP_TEST).$(CONFIG)'
-
-unittest:
-	sbt 'test:runMain unittest.Launcher $(UNITTEST_NAME)'
-
-clean:
-	-rm -rf target project/target project/project test_run_dir
-
-cleanall:
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/chisel
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/libvta_hw.so
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/libvta_hw.dylib
-	-rm -rf $(VTA_HW_PATH)/$(BUILD_NAME)/verilator
diff --git a/vta/vta-hw/hardware/chisel/README.md b/vta/vta-hw/hardware/chisel/README.md
deleted file mode 100644
index 40c4322..0000000
--- a/vta/vta-hw/hardware/chisel/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-VTA in Chisel
-===================================================
-For contributors who want to test a Chisel module:
-
- - You can add your test files in `src/test/scala/unittest`
- - Add your test name and tests to the `test` object in `src/test/scala/unittest/Launcher.scala`
- - Check out the provided sample test `mvm` which tests the MatrixVectorComputation module
-    in `src/main/scala/core/TensorGemm.scala`
-
-- Running unit tests: `make unittest UNITTEST_NAME=your_test_name`
-
-
-
diff --git a/vta/vta-hw/hardware/chisel/build.sbt b/vta/vta-hw/hardware/chisel/build.sbt
deleted file mode 100644
index 7efd59d..0000000
--- a/vta/vta-hw/hardware/chisel/build.sbt
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-name := "vta"
-version := "0.1.0-SNAPSHOT"
-organization := "edu.washington.cs"
-
-def scalacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // If we're building with Scala > 2.11, enable the compile option
-    //  switch to support our anonymous Bundle definitions:
-    //  https://github.com/scala/bug/issues/10047
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 => Seq()
-      case _ => Seq(
-        "-Xsource:2.11",
-        "-language:reflectiveCalls",
-        "-language:implicitConversions",
-        "-deprecation",
-        "-Xlint",
-        "-Ywarn-unused",
-      )
-    }
-  }
-}
-
-def javacOptionsVersion(scalaVersion: String): Seq[String] = {
-  Seq() ++ {
-    // Scala 2.12 requires Java 8. We continue to generate
-    //  Java 7 compatible code for Scala 2.11
-    //  for compatibility with old clients.
-    CrossVersion.partialVersion(scalaVersion) match {
-      case Some((2, scalaMajor: Long)) if scalaMajor < 12 =>
-        Seq("-source", "1.7", "-target", "1.7")
-      case _ =>
-        Seq("-source", "1.8", "-target", "1.8")
-    }
-  }
-}
-
-scalaVersion := "2.11.12"
-
-resolvers ++= Seq(
-  Resolver.sonatypeRepo("snapshots"),
-  Resolver.sonatypeRepo("releases"))
-
-val defaultVersions = Map(
-  "chisel3" -> "3.1.7",
-  "chisel-iotesters" -> "1.2.4"
-  )
-
-libraryDependencies ++= Seq("chisel3","chisel-iotesters").map {
-  dep: String => "edu.berkeley.cs" %% dep % sys.props.getOrElse(dep + "Version", defaultVersions(dep)) }
-
-scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
-javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/vta/vta-hw/hardware/chisel/project/build.properties b/vta/vta-hw/hardware/chisel/project/build.properties
deleted file mode 100644
index fc7998e..0000000
--- a/vta/vta-hw/hardware/chisel/project/build.properties
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-sbt.version = 1.3.2
diff --git a/vta/vta-hw/hardware/chisel/project/plugins.sbt b/vta/vta-hw/hardware/chisel/project/plugins.sbt
deleted file mode 100644
index 19ae5c9..0000000
--- a/vta/vta-hw/hardware/chisel/project/plugins.sbt
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-logLevel := Level.Warn
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/vta-hw/hardware/chisel/scalastyle-config.xml b/vta/vta-hw/hardware/chisel/scalastyle-config.xml
deleted file mode 100644
index ae7c8e6..0000000
--- a/vta/vta-hw/hardware/chisel/scalastyle-config.xml
+++ /dev/null
@@ -1,128 +0,0 @@
-<scalastyle>
- <name>Scalastyle standard configuration</name>
- <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxFileLength"><![CDATA[800]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
-  <parameters>
-   <parameter name="header"><![CDATA[/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxLineLength"><![CDATA[120]]></parameter>
-   <parameter name="tabSize"><![CDATA[2]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true">
-  <parameters>
-   <parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
-  <parameters>
-   <parameter name="maxParameters"><![CDATA[8]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
-  <parameters>
-   <parameter name="ignore"><![CDATA[-1,0,1,2,3,4,8,16,32,64,128]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
-  <parameters>
-   <parameter name="regex"><![CDATA[println]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="true">
-  <parameters>
-   <parameter name="maxTypes"><![CDATA[30]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="true">
-  <parameters>
-   <parameter name="maximum"><![CDATA[10]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="false">
-  <parameters>
-   <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
-   <parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true">
-  <parameters>
-   <parameter name="maxLength"><![CDATA[50]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
-  <parameters>
-   <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true">
-  <parameters>
-   <parameter name="maxMethods"><![CDATA[30]]></parameter>
-  </parameters>
- </check>
- <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
- <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
- <check level="error" class="org.scalastyle.file.IndentationChecker" enabled="true">
-   <parameters>
-     <parameter name="tabSize">2</parameter>
-     <parameter name="methodParamIndentSize">2</parameter>
-     <parameter name="classParamIndentSize">4</parameter>
-   </parameters>
- </check>
-</scalastyle>
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
deleted file mode 100644
index 3441e3e..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTAHostDPI #
-( parameter ADDR_BITS = 8,
-  parameter DATA_BITS = 32
-)
-(
-  input                        clock,
-  input                        reset,
-  output logic                 dpi_req_valid,
-  output logic                 dpi_req_opcode,
-  output logic [ADDR_BITS-1:0] dpi_req_addr,
-  output logic [DATA_BITS-1:0] dpi_req_value,
-  input                        dpi_req_deq,
-  input                        dpi_resp_valid,
-  input        [DATA_BITS-1:0] dpi_resp_bits
-);
-
-  import "DPI-C" function void VTAHostDPI
-  (
-    output byte unsigned req_valid,
-    output byte unsigned req_opcode,
-    output byte unsigned req_addr,
-    output int  unsigned req_value,
-    input  byte unsigned req_deq,
-    input  byte unsigned resp_valid,
-    input  int  unsigned resp_value
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-  typedef logic [31:0] dpi32_t;
-
-  dpi1_t  __reset;
-  dpi8_t  __req_valid;
-  dpi8_t  __req_opcode;
-  dpi8_t  __req_addr;
-  dpi32_t __req_value;
-  dpi8_t  __req_deq;
-  dpi8_t  __resp_valid;
-  dpi32_t __resp_bits;
-
-  // reset
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // delaying outputs by one-cycle
-  // since verilator does not support delays
-  always_ff @(posedge clock) begin
-    dpi_req_valid  <= dpi1_t ' (__req_valid);
-    dpi_req_opcode <= dpi1_t ' (__req_opcode);
-    dpi_req_addr   <= __req_addr;
-    dpi_req_value  <= __req_value;
-  end
-
-  assign __req_deq    = dpi8_t ' (dpi_req_deq);
-  assign __resp_valid = dpi8_t ' (dpi_resp_valid);
-  assign __resp_bits  = dpi_resp_bits;
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __req_valid = 0;
-      __req_opcode = 0;
-      __req_addr = 0;
-      __req_value = 0;
-    end
-    else begin
-      VTAHostDPI(
-        __req_valid,
-        __req_opcode,
-        __req_addr,
-        __req_value,
-        __req_deq,
-        __resp_valid,
-        __resp_bits);
-    end
-  end
-
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
deleted file mode 100644
index e0ed949..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTAMemDPI.v
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTAMemDPI #
-( parameter LEN_BITS = 8,
-  parameter ADDR_BITS = 64,
-  parameter DATA_BITS = 64
-)
-(
-  input                        clock,
-  input                        reset,
-  input                        dpi_req_valid,
-  input                        dpi_req_opcode,
-  input         [LEN_BITS-1:0] dpi_req_len,
-  input        [ADDR_BITS-1:0] dpi_req_addr,
-  input                        dpi_wr_valid,
-  input        [DATA_BITS-1:0] dpi_wr_bits,
-  output logic                 dpi_rd_valid,
-  output logic [DATA_BITS-1:0] dpi_rd_bits,
-  input                        dpi_rd_ready
-);
-
-  import "DPI-C" function void VTAMemDPI
-  (
-    input  byte     unsigned req_valid,
-    input  byte     unsigned req_opcode,
-    input  byte     unsigned req_len,
-    input  longint  unsigned req_addr,
-    input  byte     unsigned wr_valid,
-    input  longint  unsigned wr_value,
-    output byte     unsigned rd_valid,
-    output longint  unsigned rd_value,
-    input  byte     unsigned rd_ready
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-  typedef logic [31:0] dpi32_t;
-  typedef logic [63:0] dpi64_t;
-
-  dpi1_t  __reset;
-  dpi8_t  __req_valid;
-  dpi8_t  __req_opcode;
-  dpi8_t  __req_len;
-  dpi64_t __req_addr;
-  dpi8_t  __wr_valid;
-  dpi64_t __wr_value;
-  dpi8_t  __rd_valid;
-  dpi64_t __rd_value;
-  dpi8_t  __rd_ready;
-
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // delaying outputs by one-cycle
-  // since verilator does not support delays
-  always_ff @(posedge clock) begin
-    dpi_rd_valid <= dpi1_t ' (__rd_valid);
-    dpi_rd_bits  <= __rd_value;
-  end
-
-  assign __req_valid  = dpi8_t ' (dpi_req_valid);
-  assign __req_opcode = dpi8_t ' (dpi_req_opcode);
-  assign __req_len    = dpi_req_len;
-  assign __req_addr   = dpi_req_addr;
-  assign __wr_valid   = dpi8_t ' (dpi_wr_valid);
-  assign __wr_value   = dpi_wr_bits;
-  assign __rd_ready   = dpi8_t ' (dpi_rd_ready);
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __rd_valid = 0;
-      __rd_value = 0;
-    end
-    else begin
-      VTAMemDPI(
-        __req_valid,
-        __req_opcode,
-        __req_len,
-        __req_addr,
-        __wr_valid,
-        __wr_value,
-        __rd_valid,
-        __rd_value,
-        __rd_ready);
-    end
-  end
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v b/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
deleted file mode 100644
index fc0d4c8..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/resources/verilog/VTASimDPI.v
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-module VTASimDPI
-(
-  input                        clock,
-  input                        reset,
-  output logic                 dpi_wait
-);
-
-  import "DPI-C" function void VTASimDPI
-  (
-    output byte unsigned sim_wait,
-    output byte unsigned sim_exit
-  );
-
-  typedef logic        dpi1_t;
-  typedef logic  [7:0] dpi8_t;
-
-  dpi1_t __reset;
-  dpi8_t __wait;
-  dpi8_t __exit;
-
-  // reset
-  always_ff @(posedge clock) begin
-    __reset <= reset;
-  end
-
-  // evaluate DPI function
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      __wait = 0;
-      __exit = 0;
-    end
-    else begin
-      VTASimDPI(
-        __wait,
-	__exit);
-    end
-  end
-
-  logic wait_reg;
-
-  always_ff @(posedge clock) begin
-    if (reset | __reset) begin
-      wait_reg <= 1'b0;
-    end else if (__wait == 1) begin
-      wait_reg <= 1'b1;
-    end else begin
-      wait_reg <= 1'b0;
-    end
-  end
-
-  assign dpi_wait = wait_reg;
-
-  always_ff @(posedge clock) begin
-    if (__exit == 1) begin
-      $finish;
-    end
-  end
-
-endmodule
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala
deleted file mode 100644
index a1e7fad..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Compute.scala
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Compute.
- *
- * The compute unit is in charge of the following:
- * - Loading micro-ops from memory (loadUop module)
- * - Loading biases (acc) from memory (tensorAcc module)
- * - Compute ALU instructions (tensorAlu module)
- * - Compute GEMM instructions (tensorGemm module)
- */
-class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Vec(2, Input(Bool()))
-    val o_post = Vec(2, Output(Bool()))
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val uop_baddr = Input(UInt(mp.addrBits.W))
-    val acc_baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = Vec(2, new VMEReadMaster)
-    val inp = new TensorMaster(tensorType = "inp")
-    val wgt = new TensorMaster(tensorType = "wgt")
-    val out = new TensorMaster(tensorType = "out")
-    val finish = Output(Bool())
-    val acc_wr_event = Output(Bool())
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Seq.tabulate(2)(_ =>
-    Module(new Semaphore(counterBits = 8, counterInitValue = 0)))
-
-  val loadUop = Module(new LoadUop)
-  val tensorAcc = Module(new TensorLoad(tensorType = "acc"))
-  val tensorGemm = Module(new TensorGemm)
-  val tensorAlu = Module(new TensorAlu)
-
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  // decode
-  val dec = Module(new ComputeDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val inst_type =
-    Cat(dec.io.isFinish,
-      dec.io.isAlu,
-      dec.io.isGemm,
-      dec.io.isLoadAcc,
-      dec.io.isLoadUop).asUInt
-
-  val sprev = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s(0).io.sready, true.B)
-  val snext = inst_q.io.deq.valid & Mux(dec.io.pop_next, s(1).io.sready, true.B)
-  val start = snext & sprev
-  val done =
-    MuxLookup(
-      inst_type,
-      false.B, // default
-      Array(
-        "h_01".U -> loadUop.io.done,
-        "h_02".U -> tensorAcc.io.done,
-        "h_04".U -> tensorGemm.io.done,
-        "h_08".U -> tensorAlu.io.done,
-        "h_10".U -> true.B // Finish
-      )
-    )
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(inst_type.orR) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // uop
-  loadUop.io.start := state === sIdle & start & dec.io.isLoadUop
-  loadUop.io.inst := inst_q.io.deq.bits
-  loadUop.io.baddr := io.uop_baddr
-  io.vme_rd(0) <> loadUop.io.vme_rd
-  loadUop.io.uop.idx <> Mux(dec.io.isGemm, tensorGemm.io.uop.idx, tensorAlu.io.uop.idx)
-
-  // acc
-  tensorAcc.io.start := state === sIdle & start & dec.io.isLoadAcc
-  tensorAcc.io.inst := inst_q.io.deq.bits
-  tensorAcc.io.baddr := io.acc_baddr
-  tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
-  tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
-  io.vme_rd(1) <> tensorAcc.io.vme_rd
-  io.acc_wr_event := tensorAcc.io.tensor.wr.valid
-
-  // gemm
-  tensorGemm.io.start := state === sIdle & start & dec.io.isGemm
-  tensorGemm.io.inst := inst_q.io.deq.bits
-  tensorGemm.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isGemm
-  tensorGemm.io.uop.data.bits <> loadUop.io.uop.data.bits
-  tensorGemm.io.inp <> io.inp
-  tensorGemm.io.wgt <> io.wgt
-  tensorGemm.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isGemm
-  tensorGemm.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
-  tensorGemm.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isGemm
-  tensorGemm.io.out.rd.data.bits <> io.out.rd.data.bits
-
-  // alu
-  tensorAlu.io.start := state === sIdle & start & dec.io.isAlu
-  tensorAlu.io.inst := inst_q.io.deq.bits
-  tensorAlu.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isAlu
-  tensorAlu.io.uop.data.bits <> loadUop.io.uop.data.bits
-  tensorAlu.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isAlu
-  tensorAlu.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
-  tensorAlu.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isAlu
-  tensorAlu.io.out.rd.data.bits <> io.out.rd.data.bits
-
-  // out
-  io.out.rd.idx <> Mux(dec.io.isGemm,
-    tensorGemm.io.out.rd.idx,
-    tensorAlu.io.out.rd.idx)
-  io.out.wr <> Mux(dec.io.isGemm, tensorGemm.io.out.wr, tensorAlu.io.out.wr)
-
-  // semaphore
-  s(0).io.spost := io.i_post(0)
-  s(1).io.spost := io.i_post(1)
-  s(0).io.swait := dec.io.pop_prev & (state === sIdle & start)
-  s(1).io.swait := dec.io.pop_next & (state === sIdle & start)
-  io.o_post(0) := dec.io.push_prev & ((state === sExe & done) | (state === sSync))
-  io.o_post(1) := dec.io.push_next & ((state === sExe & done) | (state === sSync))
-
-  // finish
-  io.finish := state === sExe & done & dec.io.isFinish
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Compute] start sync\n")
-      }.elsewhen(dec.io.isLoadUop) {
-        printf("[Compute] start load uop\n")
-      }.elsewhen(dec.io.isLoadAcc) {
-        printf("[Compute] start load acc\n")
-      }.elsewhen(dec.io.isGemm) {
-        printf("[Compute] start gemm\n")
-      }.elsewhen(dec.io.isAlu) {
-        printf("[Compute] start alu\n")
-      }.elsewhen(dec.io.isFinish) {
-        printf("[Compute] start finish\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Compute] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        when(dec.io.isLoadUop) {
-          printf("[Compute] done load uop\n")
-        }.elsewhen(dec.io.isLoadAcc) {
-          printf("[Compute] done load acc\n")
-        }.elsewhen(dec.io.isGemm) {
-          printf("[Compute] done gemm\n")
-        }.elsewhen(dec.io.isAlu) {
-          printf("[Compute] done alu\n")
-        }.elsewhen(dec.io.isFinish) {
-          printf("[Compute] done finish\n")
-        }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala
deleted file mode 100644
index 4ab7d85..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Configs.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import vta.util.config._
-
-/** CoreConfig.
- *
- * This is one supported configuration for VTA. This file will
- * be eventually filled out with class configurations that can be
- * mixed/matched with Shell configurations for different backends.
- */
-class CoreConfig extends Config((site, here, up) => {
-  case CoreKey =>
-    CoreParams(
-      batch = 1,
-      blockOut = 16,
-      blockIn = 16,
-      inpBits = 8,
-      wgtBits = 8,
-      uopBits = 32,
-      accBits = 32,
-      outBits = 8,
-      uopMemDepth = 2048,
-      inpMemDepth = 2048,
-      wgtMemDepth = 1024,
-      accMemDepth = 2048,
-      outMemDepth = 2048,
-      instQueueEntries = 512
-    )
-})
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala
deleted file mode 100644
index e2ac51a..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Core.scala
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import vta.util.config._
-import vta.shell._
-
-/** Core parameters */
-case class CoreParams(
-    batch: Int = 1,
-    blockOut: Int = 16,
-    blockIn: Int = 16,
-    inpBits: Int = 8,
-    wgtBits: Int = 8,
-    uopBits: Int = 32,
-    accBits: Int = 32,
-    outBits: Int = 8,
-    uopMemDepth: Int = 512,
-    inpMemDepth: Int = 512,
-    wgtMemDepth: Int = 512,
-    accMemDepth: Int = 512,
-    outMemDepth: Int = 512,
-    instQueueEntries: Int = 32
-) {
-  require(uopBits % 8 == 0,
-    s"\n\n[VTA] [CoreParams] uopBits must be byte aligned\n\n")
-}
-
-case object CoreKey extends Field[CoreParams]
-
-/** Core.
- *
- * The core defines the current VTA architecture by connecting memory and
- * compute modules together such as load/store and compute. Most of the
- * connections in the core are bulk (<>), and we should try to keep it this
- * way, because it is easier to understand what is going on.
- *
- * Also, the core must be instantiated by a shell using the
- * VTA Control Register (VCR) and the VTA Memory Engine (VME) interfaces.
- * More info about these interfaces and modules can be found in the shell
- * directory.
- */
-class Core(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val vcr = new VCRClient
-    val vme = new VMEMaster
-  })
-  val fetch = Module(new Fetch)
-  val load = Module(new Load)
-  val compute = Module(new Compute)
-  val store = Module(new Store)
-  val ecounters = Module(new EventCounters)
-
-  // Read(rd) and write(wr) from/to memory (i.e. DRAM)
-  io.vme.rd(0) <> fetch.io.vme_rd
-  io.vme.rd(1) <> compute.io.vme_rd(0)
-  io.vme.rd(2) <> load.io.vme_rd(0)
-  io.vme.rd(3) <> load.io.vme_rd(1)
-  io.vme.rd(4) <> compute.io.vme_rd(1)
-  io.vme.wr(0) <> store.io.vme_wr
-
-  // Fetch instructions (tasks) from memory (DRAM) into queues (SRAMs)
-  fetch.io.launch := io.vcr.launch
-  fetch.io.ins_baddr := io.vcr.ptrs(0)
-  fetch.io.ins_count := io.vcr.vals(0)
-
-  // Load inputs and weights from memory (DRAM) into scratchpads (SRAMs)
-  load.io.i_post := compute.io.o_post(0)
-  load.io.inst <> fetch.io.inst.ld
-  load.io.inp_baddr := io.vcr.ptrs(2)
-  load.io.wgt_baddr := io.vcr.ptrs(3)
-
-  // The compute module performs the following:
-  // - Load micro-ops (uops) and accumulations (acc)
-  // - Compute dense and ALU instructions (tasks)
-  compute.io.i_post(0) := load.io.o_post
-  compute.io.i_post(1) := store.io.o_post
-  compute.io.inst <> fetch.io.inst.co
-  compute.io.uop_baddr := io.vcr.ptrs(1)
-  compute.io.acc_baddr := io.vcr.ptrs(4)
-  compute.io.inp <> load.io.inp
-  compute.io.wgt <> load.io.wgt
-
-  // The store module performs the following:
-  // - Writes results from compute into scratchpads (SRAMs)
-  // - Store results from scratchpads (SRAMs) to memory (DRAM)
-  store.io.i_post := compute.io.o_post(1)
-  store.io.inst <> fetch.io.inst.st
-  store.io.out_baddr := io.vcr.ptrs(5)
-  store.io.out <> compute.io.out
-
-  // Event counters
-  ecounters.io.launch := io.vcr.launch
-  ecounters.io.finish := compute.io.finish
-  io.vcr.ecnt <> ecounters.io.ecnt
-  io.vcr.ucnt <> ecounters.io.ucnt
-  ecounters.io.acc_wr_event := compute.io.acc_wr_event
-
-  // Finish instruction is executed and asserts the VCR finish flag
-  val finish = RegNext(compute.io.finish)
-  io.vcr.finish := finish
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala
deleted file mode 100644
index 37f6ab4..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Decode.scala
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-
-import ISA._
-
-/** MemDecode.
- *
- * Decode memory instructions with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields. These are the instructions
- * decoded with this bundle:
- *   - LUOP
- *   - LWGT
- *   - LINP
- *   - LACC
- *   - SOUT
- */
-class MemDecode extends Bundle {
-  val xpad_1 = UInt(M_PAD_BITS.W)
-  val xpad_0 = UInt(M_PAD_BITS.W)
-  val ypad_1 = UInt(M_PAD_BITS.W)
-  val ypad_0 = UInt(M_PAD_BITS.W)
-  val xstride = UInt(M_STRIDE_BITS.W)
-  val xsize = UInt(M_SIZE_BITS.W)
-  val ysize = UInt(M_SIZE_BITS.W)
-  val empty_0 = UInt(7.W) // derive this
-  val dram_offset = UInt(M_DRAM_OFFSET_BITS.W)
-  val sram_offset = UInt(M_SRAM_OFFSET_BITS.W)
-  val id = UInt(M_ID_BITS.W)
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** GemmDecode.
- *
- * Decode GEMM instruction with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields.
- */
-class GemmDecode extends Bundle {
-  val wgt_1 = UInt(C_WIDX_BITS.W)
-  val wgt_0 = UInt(C_WIDX_BITS.W)
-  val inp_1 = UInt(C_IIDX_BITS.W)
-  val inp_0 = UInt(C_IIDX_BITS.W)
-  val acc_1 = UInt(C_AIDX_BITS.W)
-  val acc_0 = UInt(C_AIDX_BITS.W)
-  val empty_0 = Bool()
-  val lp_1 = UInt(C_ITER_BITS.W)
-  val lp_0 = UInt(C_ITER_BITS.W)
-  val uop_end = UInt(C_UOP_END_BITS.W)
-  val uop_begin = UInt(C_UOP_BGN_BITS.W)
-  val reset = Bool()
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** AluDecode.
- *
- * Decode ALU instructions with a Bundle. This is similar to an union,
- * therefore order matters when declaring fields. These are the instructions
- * decoded with this bundle:
- *   - VMIN
- *   - VMAX
- *   - VADD
- *   - VSHX
- */
-class AluDecode extends Bundle {
-  val empty_1 = Bool()
-  val alu_imm = UInt(C_ALU_IMM_BITS.W)
-  val alu_use_imm = Bool()
-  val alu_op = UInt(C_ALU_DEC_BITS.W)
-  val src_1 = UInt(C_IIDX_BITS.W)
-  val src_0 = UInt(C_IIDX_BITS.W)
-  val dst_1 = UInt(C_AIDX_BITS.W)
-  val dst_0 = UInt(C_AIDX_BITS.W)
-  val empty_0 = Bool()
-  val lp_1 = UInt(C_ITER_BITS.W)
-  val lp_0 = UInt(C_ITER_BITS.W)
-  val uop_end = UInt(C_UOP_END_BITS.W)
-  val uop_begin = UInt(C_UOP_BGN_BITS.W)
-  val reset = Bool()
-  val push_next = Bool()
-  val push_prev = Bool()
-  val pop_next = Bool()
-  val pop_prev = Bool()
-  val op = UInt(OP_BITS.W)
-}
-
-/** UopDecode.
- *
- * Decode micro-ops (uops).
- */
-class UopDecode extends Bundle {
-  val u2 = UInt(10.W)
-  val u1 = UInt(11.W)
-  val u0 = UInt(11.W)
-}
-
-/** FetchDecode.
- *
- * Partial decoding for dispatching instructions to Load, Compute, and Store.
- */
-class FetchDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val isLoad = Output(Bool())
-    val isCompute = Output(Bool())
-    val isStore = Output(Bool())
-  })
-  val csignals =
-    ListLookup(
-      io.inst,
-      List(N, OP_X),
-      Array(
-        LUOP -> List(Y, OP_G),
-        LWGT -> List(Y, OP_L),
-        LINP -> List(Y, OP_L),
-        LACC -> List(Y, OP_G),
-        SOUT -> List(Y, OP_S),
-        GEMM -> List(Y, OP_G),
-        FNSH -> List(Y, OP_G),
-        VMIN -> List(Y, OP_G),
-        VMAX -> List(Y, OP_G),
-        VADD -> List(Y, OP_G),
-        VSHX -> List(Y, OP_G)
-      )
-    )
-
-  val (cs_val_inst: Bool) :: cs_op_type :: Nil = csignals
-
-  io.isLoad := cs_val_inst & cs_op_type === OP_L
-  io.isCompute := cs_val_inst & cs_op_type === OP_G
-  io.isStore := cs_val_inst & cs_op_type === OP_S
-}
-
-/** LoadDecode.
- *
- * Decode dependencies, type and sync for Load module.
- */
-class LoadDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_next = Output(Bool())
-    val pop_next = Output(Bool())
-    val isInput = Output(Bool())
-    val isWeight = Output(Bool())
-    val isSync = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_next := dec.push_next
-  io.pop_next := dec.pop_next
-  io.isInput := io.inst === LINP & dec.xsize =/= 0.U
-  io.isWeight := io.inst === LWGT & dec.xsize =/= 0.U
-  io.isSync := (io.inst === LINP | io.inst === LWGT) & dec.xsize === 0.U
-}
-
-/** ComputeDecode.
- *
- * Decode dependencies, type and sync for Compute module.
- */
-class ComputeDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_next = Output(Bool())
-    val push_prev = Output(Bool())
-    val pop_next = Output(Bool())
-    val pop_prev = Output(Bool())
-    val isLoadAcc = Output(Bool())
-    val isLoadUop = Output(Bool())
-    val isSync = Output(Bool())
-    val isAlu = Output(Bool())
-    val isGemm = Output(Bool())
-    val isFinish = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_next := dec.push_next
-  io.push_prev := dec.push_prev
-  io.pop_next := dec.pop_next
-  io.pop_prev := dec.pop_prev
-  io.isLoadAcc := io.inst === LACC & dec.xsize =/= 0.U
-  io.isLoadUop := io.inst === LUOP & dec.xsize =/= 0.U
-  io.isSync := (io.inst === LACC | io.inst === LUOP) & dec.xsize === 0.U
-  io.isAlu := io.inst === VMIN | io.inst === VMAX | io.inst === VADD | io.inst === VSHX
-  io.isGemm := io.inst === GEMM
-  io.isFinish := io.inst === FNSH
-}
-
-/** StoreDecode.
- *
- * Decode dependencies, type and sync for Store module.
- */
-class StoreDecode extends Module {
-  val io = IO(new Bundle {
-    val inst = Input(UInt(INST_BITS.W))
-    val push_prev = Output(Bool())
-    val pop_prev = Output(Bool())
-    val isStore = Output(Bool())
-    val isSync = Output(Bool())
-  })
-  val dec = io.inst.asTypeOf(new MemDecode)
-  io.push_prev := dec.push_prev
-  io.pop_prev := dec.pop_prev
-  io.isStore := io.inst === SOUT & dec.xsize =/= 0.U
-  io.isSync := io.inst === SOUT & dec.xsize === 0.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala
deleted file mode 100644
index 5ef3586..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/EventCounters.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** EventCounters.
- *
- * This unit contains all the event counting logic. One common event tracked in
- * hardware is the number of clock cycles taken to achieve certain task. We
- * can count the total number of clock cycles spent in a VTA run by checking
- * launch and finish signals.
- *
- * The event counter value is passed to the VCR module via the ecnt port, so
- * they can be accessed by the host. The number of event counters (nECnt) is
- * defined in the Shell VCR module as a parameter, see VCRParams.
- *
- * If one would like to add an event counter, then the value of nECnt must be
- * changed in VCRParams together with the corresponding counting logic here.
- */
-class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val vp = p(ShellKey).vcrParams
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val finish = Input(Bool())
-    val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
-    val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
-    val acc_wr_event = Input(Bool())
-  })
-  val cycle_cnt = RegInit(0.U(vp.regBits.W))
-  when(io.launch && !io.finish) {
-    cycle_cnt := cycle_cnt + 1.U
-  }.otherwise {
-    cycle_cnt := 0.U
-  }
-  io.ecnt(0).valid := io.finish
-  io.ecnt(0).bits := cycle_cnt
-
-  val acc_wr_count = Reg(UInt(vp.regBits.W))
-  when (!io.launch || io.finish) {
-    acc_wr_count := 0.U
-  }.elsewhen (io.acc_wr_event) {
-    acc_wr_count := acc_wr_count + 1.U
-  }
-  io.ucnt(0).valid := io.finish
-  io.ucnt(0).bits := acc_wr_count
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala
deleted file mode 100644
index 0ea35a3..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Fetch.scala
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Fetch.
- *
- * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
- * VTA Memory Engine (VME), and push them into an instruction queue called
- * inst_q. Once the instruction queue is full, instructions are dispatched to
- * the Load, Compute and Store module queues based on the instruction opcode.
- * After draining the queue, the fetch unit checks if there are more instructions
- * via the ins_count register which is written by the host.
- *
- * Additionally, instructions are read into two chunks (see sReadLSB and sReadMSB)
- * because we are using a DRAM payload of 8-bytes or half of a VTA instruction.
- * This should be configurable for larger payloads, i.e. 64-bytes, which can load
- * more than one instruction at the time. Finally, the instruction queue is
- * sized (entries_q), depending on the maximum burst allowed in the memory.
- */
-class Fetch(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val launch = Input(Bool())
-    val ins_baddr = Input(UInt(mp.addrBits.W))
-    val ins_count = Input(UInt(vp.regBits.W))
-    val vme_rd = new VMEReadMaster
-    val inst = new Bundle {
-      val ld = Decoupled(UInt(INST_BITS.W))
-      val co = Decoupled(UInt(INST_BITS.W))
-      val st = Decoupled(UInt(INST_BITS.W))
-    }
-  })
-  val entries_q = 1 << (mp.lenBits - 1) // one-instr-every-two-vme-word
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), entries_q))
-  val dec = Module(new FetchDecode)
-
-  val s1_launch = RegNext(io.launch)
-  val pulse = io.launch & ~s1_launch
-
-  val raddr = Reg(chiselTypeOf(io.vme_rd.cmd.bits.addr))
-  val rlen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val ilen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-
-  val xrem = Reg(chiselTypeOf(io.ins_count))
-  val xsize = (io.ins_count << 1.U) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-
-  val sIdle :: sReadCmd :: sReadLSB :: sReadMSB :: sDrain :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(pulse) {
-        state := sReadCmd
-        when(xsize < xmax) {
-          rlen := xsize
-          ilen := xsize >> 1.U
-          xrem := 0.U
-        }.otherwise {
-          rlen := xmax - 1.U
-          ilen := (xmax >> 1.U) - 1.U
-          xrem := xsize - xmax
-        }
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadLSB
-      }
-    }
-    is(sReadLSB) {
-      when(io.vme_rd.data.valid) {
-        state := sReadMSB
-      }
-    }
-    is(sReadMSB) {
-      when(io.vme_rd.data.valid) {
-        when(inst_q.io.count === ilen) {
-          state := sDrain
-        }.otherwise {
-          state := sReadLSB
-        }
-      }
-    }
-    is(sDrain) {
-      when(inst_q.io.count === 0.U) {
-        when(xrem === 0.U) {
-          state := sIdle
-        }.elsewhen(xrem < xmax) {
-          state := sReadCmd
-          rlen := xrem
-          ilen := xrem >> 1.U
-          xrem := 0.U
-        }.otherwise {
-          state := sReadCmd
-          rlen := xmax - 1.U
-          ilen := (xmax >> 1.U) - 1.U
-          xrem := xrem - xmax
-        }
-      }
-    }
-  }
-
-  // read instructions from dram
-  when(state === sIdle) {
-    raddr := io.ins_baddr
-  }.elsewhen(state === sDrain && inst_q.io.count === 0.U && xrem =/= 0.U) {
-    raddr := raddr + xmax_bytes
-  }
-
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := raddr
-  io.vme_rd.cmd.bits.len := rlen
-
-  io.vme_rd.data.ready := inst_q.io.enq.ready
-
-  val lsb = Reg(chiselTypeOf(io.vme_rd.data.bits))
-  val msb = io.vme_rd.data.bits
-  val inst = Cat(msb, lsb)
-
-  when(state === sReadLSB) { lsb := io.vme_rd.data.bits }
-
-  inst_q.io.enq.valid := io.vme_rd.data.valid & state === sReadMSB
-  inst_q.io.enq.bits := inst
-
-  // decode
-  dec.io.inst := inst_q.io.deq.bits
-
-  // instruction queues
-  io.inst.ld.valid := dec.io.isLoad & inst_q.io.deq.valid & state === sDrain
-  io.inst.co.valid := dec.io.isCompute & inst_q.io.deq.valid & state === sDrain
-  io.inst.st.valid := dec.io.isStore & inst_q.io.deq.valid & state === sDrain
-
-  io.inst.ld.bits := inst_q.io.deq.bits
-  io.inst.co.bits := inst_q.io.deq.bits
-  io.inst.st.bits := inst_q.io.deq.bits
-
-  // check if selected queue is ready
-  val deq_sel = Cat(dec.io.isCompute, dec.io.isStore, dec.io.isLoad).asUInt
-  val deq_ready =
-    MuxLookup(deq_sel,
-      false.B, // default
-      Array(
-        "h_01".U -> io.inst.ld.ready,
-        "h_02".U -> io.inst.st.ready,
-        "h_04".U -> io.inst.co.ready
-      ))
-
-  // dequeue instruction
-  inst_q.io.deq.ready := deq_ready & inst_q.io.deq.valid & state === sDrain
-
-  // debug
-  if (debug) {
-    when(state === sIdle && pulse) {
-      printf("[Fetch] Launch\n")
-    }
-    // instruction
-    when(inst_q.io.deq.fire()) {
-      when(dec.io.isLoad) {
-        printf("[Fetch] [instruction decode] [L] %x\n", inst_q.io.deq.bits)
-      }
-      when(dec.io.isCompute) {
-        printf("[Fetch] [instruction decode] [C] %x\n", inst_q.io.deq.bits)
-      }
-      when(dec.io.isStore) {
-        printf("[Fetch] [instruction decode] [S] %x\n", inst_q.io.deq.bits)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala
deleted file mode 100644
index bfe89eb..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/ISA.scala
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import scala.collection.mutable.HashMap
-
-/** ISAConstants.
- *
- * These constants are used for decoding (parsing) fields on instructions.
- */
-trait ISAConstants {
-  val INST_BITS = 128
-
-  val OP_BITS = 3
-
-  val M_DEP_BITS = 4
-  val M_ID_BITS = 2
-  val M_SRAM_OFFSET_BITS = 16
-  val M_DRAM_OFFSET_BITS = 32
-  val M_SIZE_BITS = 16
-  val M_STRIDE_BITS = 16
-  val M_PAD_BITS = 4
-
-  val C_UOP_BGN_BITS = 13
-  val C_UOP_END_BITS = 14
-  val C_ITER_BITS = 14
-  val C_AIDX_BITS = 11
-  val C_IIDX_BITS = 11
-  val C_WIDX_BITS = 10
-  val C_ALU_DEC_BITS = 2 // FIXME: there should be a SHL and SHR instruction
-  val C_ALU_OP_BITS = 3
-  val C_ALU_IMM_BITS = 16
-
-  val Y = true.B
-  val N = false.B
-
-  val OP_L = 0.asUInt(OP_BITS.W)
-  val OP_S = 1.asUInt(OP_BITS.W)
-  val OP_G = 2.asUInt(OP_BITS.W)
-  val OP_F = 3.asUInt(OP_BITS.W)
-  val OP_A = 4.asUInt(OP_BITS.W)
-  val OP_X = 5.asUInt(OP_BITS.W)
-
-  val ALU_OP_NUM = 5
-  val ALU_OP = Enum(ALU_OP_NUM)
-
-  val M_ID_U = 0.asUInt(M_ID_BITS.W)
-  val M_ID_W = 1.asUInt(M_ID_BITS.W)
-  val M_ID_I = 2.asUInt(M_ID_BITS.W)
-  val M_ID_A = 3.asUInt(M_ID_BITS.W)
-}
-
-/** ISA.
- *
- * This is the VTA task ISA
- *
- * TODO: Add VXOR to clear accumulator
- * TODO: Use ISA object for decoding as well
- * TODO: Eventually deprecate ISAConstants
- */
-object ISA {
-  private val xLen = 128
-  private val depBits = 4
-
-  private val idBits: HashMap[String, Int] =
-    HashMap(("task", 3), ("mem", 2), ("alu", 2))
-
-  private val taskId: HashMap[String, String] =
-    HashMap(("load", "000"),
-      ("store", "001"),
-      ("gemm", "010"),
-      ("finish", "011"),
-      ("alu", "100"))
-
-  private val memId: HashMap[String, String] =
-    HashMap(("uop", "00"), ("wgt", "01"), ("inp", "10"), ("acc", "11"))
-
-  private val aluId: HashMap[String, String] =
-    HashMap(("minpool", "00"),
-      ("maxpool", "01"),
-      ("add", "10"),
-      ("shift", "11"))
-
-  private def dontCare(bits: Int): String = "?" * bits
-
-  private def instPat(bin: String): BitPat = BitPat("b" + bin)
-
-  private def load(id: String): BitPat = {
-    val rem = xLen - idBits("mem") - depBits - idBits("task")
-    val inst = dontCare(rem) + memId(id) + dontCare(depBits) + taskId("load")
-    instPat(inst)
-  }
-
-  private def store: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("store")
-    instPat(inst)
-  }
-
-  private def gemm: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("gemm")
-    instPat(inst)
-  }
-
-  private def alu(id: String): BitPat = {
-    // TODO: move alu id next to task id
-    val inst = dontCare(18) + aluId(id) + dontCare(105) + taskId("alu")
-    instPat(inst)
-  }
-
-  private def finish: BitPat = {
-    val rem = xLen - idBits("task")
-    val inst = dontCare(rem) + taskId("finish")
-    instPat(inst)
-  }
-
-  def LUOP = load("uop")
-  def LWGT = load("wgt")
-  def LINP = load("inp")
-  def LACC = load("acc")
-  def SOUT = store
-  def GEMM = gemm
-  def VMIN = alu("minpool")
-  def VMAX = alu("maxpool")
-  def VADD = alu("add")
-  def VSHX = alu("shift")
-  def FNSH = finish
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala
deleted file mode 100644
index 50c26bb..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Load.scala
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Load.
- *
- * Load inputs and weights from memory (DRAM) into scratchpads (SRAMs).
- * This module instantiate the TensorLoad unit which is in charge of
- * loading 1D and 2D tensors to scratchpads, so it can be used by
- * other modules such as Compute.
- */
-class Load(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Input(Bool())
-    val o_post = Output(Bool())
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val inp_baddr = Input(UInt(mp.addrBits.W))
-    val wgt_baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = Vec(2, new VMEReadMaster)
-    val inp = new TensorClient(tensorType = "inp")
-    val wgt = new TensorClient(tensorType = "wgt")
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Module(new Semaphore(counterBits = 8, counterInitValue = 0))
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  val dec = Module(new LoadDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val tensorType = Seq("inp", "wgt")
-  val tensorDec = Seq(dec.io.isInput, dec.io.isWeight)
-  val tensorLoad =
-    Seq.tabulate(2)(i => Module(new TensorLoad(tensorType = tensorType(i))))
-
-  val start = inst_q.io.deq.valid & Mux(dec.io.pop_next, s.io.sready, true.B)
-  val done = Mux(dec.io.isInput, tensorLoad(0).io.done, tensorLoad(1).io.done)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(dec.io.isInput || dec.io.isWeight) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // load tensor
-  // [0] input (inp)
-  // [1] weight (wgt)
-  val ptr = Seq(io.inp_baddr, io.wgt_baddr)
-  val tsor = Seq(io.inp, io.wgt)
-  for (i <- 0 until 2) {
-    tensorLoad(i).io.start := state === sIdle & start & tensorDec(i)
-    tensorLoad(i).io.inst := inst_q.io.deq.bits
-    tensorLoad(i).io.baddr := ptr(i)
-    tensorLoad(i).io.tensor <> tsor(i)
-    io.vme_rd(i) <> tensorLoad(i).io.vme_rd
-  }
-
-  // semaphore
-  s.io.spost := io.i_post
-  s.io.swait := dec.io.pop_next & (state === sIdle & start)
-  io.o_post := dec.io.push_next & ((state === sExe & done) | (state === sSync))
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Load] start sync\n")
-      }.elsewhen(dec.io.isInput) {
-        printf("[Load] start input\n")
-      }.elsewhen(dec.io.isWeight) {
-        printf("[Load] start weight\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Load] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        when(dec.io.isInput) {
-          printf("[Load] done input\n")
-        }.elsewhen(dec.io.isWeight) {
-          printf("[Load] done weight\n")
-        }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala
deleted file mode 100644
index 87bd508..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** UopMaster.
- *
- * Uop interface used by a master module, i.e. TensorAlu or TensorGemm,
- * to request a micro-op (uop) from the uop-scratchpad. The index (idx) is
- * used as an address to find the uop in the uop-scratchpad.
- */
-class UopMaster(implicit p: Parameters) extends Bundle {
-  val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
-  val idx = ValidIO(UInt(addrBits.W))
-  val data = Flipped(ValidIO(new UopDecode))
-  override def cloneType = new UopMaster().asInstanceOf[this.type]
-}
-
-/** UopClient.
- *
- * Uop interface used by a client module, i.e. LoadUop, to receive
- * a request from a master module, i.e. TensorAlu or TensorGemm.
- * The index (idx) is used as an address to find the uop in the uop-scratchpad.
- */
-class UopClient(implicit p: Parameters) extends Bundle {
-  val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
-  val idx = Flipped(ValidIO(UInt(addrBits.W)))
-  val data = ValidIO(new UopDecode)
-  override def cloneType = new UopClient().asInstanceOf[this.type]
-}
-
-/** LoadUop.
- *
- * Load micro-ops (uops) from memory, i.e. DRAM, and store them in the
- * uop-scratchpad. Currently, micro-ops are 32-bit wide and loaded in
- * group of 2 given the fact that the DRAM payload is 8-bytes. This module
- * should be modified later on to support different DRAM sizes efficiently.
- */
-class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = new VMEReadMaster
-    val uop = new UopClient
-  })
-  val numUop = 2 // store two uops per sram word
-  val uopBits = p(CoreKey).uopBits
-  val uopBytes = uopBits / 8
-  val uopDepth = p(CoreKey).uopMemDepth / numUop
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val raddr = Reg(chiselTypeOf(io.vme_rd.cmd.bits.addr))
-  val xcnt = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val xlen = Reg(chiselTypeOf(io.vme_rd.cmd.bits.len))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize >> log2Ceil(numUop)) + dec.xsize(0) + (dec.sram_offset % 2.U) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-
-  val dram_even = (dec.dram_offset % 2.U) === 0.U
-  val sram_even = (dec.sram_offset % 2.U) === 0.U
-  val sizeIsEven = (dec.xsize % 2.U) === 0.U
-
-  val sIdle :: sReadCmd :: sReadData :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadCmd
-        when(xsize < xmax) {
-          xlen := xsize
-          xrem := 0.U
-        }.otherwise {
-          xlen := xmax - 1.U
-          xrem := xsize - xmax
-        }
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.vme_rd.data.valid) {
-        when(xcnt === xlen) {
-          when(xrem === 0.U) {
-            state := sIdle
-          }.otherwise {
-            raddr := raddr + xmax_bytes
-            when(xrem < xmax) {
-              state := sReadCmd
-              xlen := xrem
-              xrem := 0.U
-            }
-            .otherwise {
-              state := sReadCmd
-              xlen := xmax - 1.U
-              xrem := xrem - xmax
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // read-from-dram
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  when(state === sIdle) {
-    when(dram_even) {
-      raddr := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))
-    }.otherwise {
-      raddr := (io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))) - uopBytes.U
-    }
-  }
-
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := raddr
-  io.vme_rd.cmd.bits.len := xlen
-
-  io.vme_rd.data.ready := state === sReadData
-
-  when(state =/= sReadData) {
-    xcnt := 0.U
-  }.elsewhen(io.vme_rd.data.fire()) {
-    xcnt := xcnt + 1.U
-  }
-
-  val waddr = Reg(UInt(log2Ceil(uopDepth).W))
-  when(state === sIdle) {
-    waddr := dec.sram_offset >> log2Ceil(numUop)
-  }.elsewhen(io.vme_rd.data.fire()) {
-    waddr := waddr + 1.U
-  }
-
-  val wdata = Wire(Vec(numUop, UInt(uopBits.W)))
-  val mem = SyncReadMem(uopDepth, chiselTypeOf(wdata))
-  val wmask = Reg(Vec(numUop, Bool()))
-
-  when(sram_even) {
-    when(sizeIsEven) {
-      wmask := "b_11".U.asTypeOf(wmask)
-    }.elsewhen(io.vme_rd.cmd.fire()) {
-      when(dec.xsize === 1.U) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }.elsewhen(io.vme_rd.data.fire()) {
-      when((xcnt === xlen - 1.U) && (xrem === 0.U)) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }
-  }.otherwise {
-    when(io.vme_rd.cmd.fire()) {
-      wmask := "b_10".U.asTypeOf(wmask)
-    }.elsewhen(io.vme_rd.data.fire()) {
-      when(sizeIsEven && (xcnt === xlen - 1.U) && (xrem === 0.U)) {
-        wmask := "b_01".U.asTypeOf(wmask)
-      }.otherwise {
-        wmask := "b_11".U.asTypeOf(wmask)
-      }
-    }
-  }
-
-  wdata := io.vme_rd.data.bits.asTypeOf(wdata)
-  when(dram_even === false.B && sram_even) {
-    wdata(0) := io.vme_rd.data.bits.asTypeOf(wdata)(1)
-  }.elsewhen(sram_even === false.B && dram_even) {
-    wdata(1) := io.vme_rd.data.bits.asTypeOf(wdata)(0)
-  }
-
-  when(io.vme_rd.data.fire()) {
-    mem.write(waddr, wdata, wmask)
-  }
-
-  // read-from-sram
-  io.uop.data.valid := RegNext(io.uop.idx.valid)
-
-  val sIdx = io.uop.idx.bits % numUop.U
-  val rIdx = io.uop.idx.bits >> log2Ceil(numUop)
-  val memRead = mem.read(rIdx, io.uop.idx.valid)
-  val sWord = memRead.asUInt.asTypeOf(wdata)
-  val sUop = sWord(sIdx).asTypeOf(io.uop.data.bits)
-
-  io.uop.data.bits <> sUop
-
-  // done
-  io.done := state === sReadData & io.vme_rd.data.valid & xcnt === xlen & xrem === 0.U
-
-  // debug
-  if (debug) {
-    when(io.vme_rd.cmd.fire()) {
-      printf("[LoadUop] cmd addr:%x len:%x rem:%x\n", raddr, xlen, xrem)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala
deleted file mode 100644
index efc895b..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Semaphore.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-
-/** Semaphore.
- *
- * This semaphore is used instead of push/pop fifo, used in the initial
- * version of VTA. This semaphore is incremented (spost) or decremented (swait)
- * depending on the push and pop fields on instructions to prevent RAW and WAR
- * hazards.
- */
-class Semaphore(counterBits: Int = 1, counterInitValue: Int = 1) extends Module {
-  val io = IO(new Bundle {
-    val spost = Input(Bool())
-    val swait = Input(Bool())
-    val sready = Output(Bool())
-  })
-  val cnt = RegInit(counterInitValue.U(counterBits.W))
-  when(io.spost && !io.swait && cnt =/= ((1 << counterBits) - 1).asUInt) {
-    cnt := cnt + 1.U
-  }
-  when(!io.spost && io.swait && cnt =/= 0.U) { cnt := cnt - 1.U }
-  io.sready := cnt =/= 0.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala
deleted file mode 100644
index 025a0a2..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/Store.scala
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** Store.
- *
- * Store results back to memory (DRAM) from scratchpads (SRAMs).
- * This module instantiate the TensorStore unit which is in charge
- * of storing 1D and 2D tensors to main memory.
- */
-class Store(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val i_post = Input(Bool())
-    val o_post = Output(Bool())
-    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
-    val out_baddr = Input(UInt(mp.addrBits.W))
-    val vme_wr = new VMEWriteMaster
-    val out = new TensorClient(tensorType = "out")
-  })
-  val sIdle :: sSync :: sExe :: Nil = Enum(3)
-  val state = RegInit(sIdle)
-
-  val s = Module(new Semaphore(counterBits = 8, counterInitValue = 0))
-  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))
-
-  val dec = Module(new StoreDecode)
-  dec.io.inst := inst_q.io.deq.bits
-
-  val tensorStore = Module(new TensorStore(tensorType = "out"))
-
-  val start = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s.io.sready, true.B)
-  val done = tensorStore.io.done
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(start) {
-        when(dec.io.isSync) {
-          state := sSync
-        }.elsewhen(dec.io.isStore) {
-          state := sExe
-        }
-      }
-    }
-    is(sSync) {
-      state := sIdle
-    }
-    is(sExe) {
-      when(done) {
-        state := sIdle
-      }
-    }
-  }
-
-  // instructions
-  inst_q.io.enq <> io.inst
-  inst_q.io.deq.ready := (state === sExe & done) | (state === sSync)
-
-  // store
-  tensorStore.io.start := state === sIdle & start & dec.io.isStore
-  tensorStore.io.inst := inst_q.io.deq.bits
-  tensorStore.io.baddr := io.out_baddr
-  io.vme_wr <> tensorStore.io.vme_wr
-  tensorStore.io.tensor <> io.out
-
-  // semaphore
-  s.io.spost := io.i_post
-  s.io.swait := dec.io.pop_prev & (state === sIdle & start)
-  io.o_post := dec.io.push_prev & ((state === sExe & done) | (state === sSync))
-
-  // debug
-  if (debug) {
-    // start
-    when(state === sIdle && start) {
-      when(dec.io.isSync) {
-        printf("[Store] start sync\n")
-      }.elsewhen(dec.io.isStore) {
-        printf("[Store] start\n")
-      }
-    }
-    // done
-    when(state === sSync) {
-      printf("[Store] done sync\n")
-    }
-    when(state === sExe) {
-      when(done) {
-        printf("[Store] done\n")
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala
deleted file mode 100644
index 6af3c83..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorAlu.scala
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-
-/** ALU datapath */
-class Alu(implicit p: Parameters) extends Module {
-  val aluBits = p(CoreKey).accBits
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val a = Input(SInt(aluBits.W))
-    val b = Input(SInt(aluBits.W))
-    val y = Output(SInt(aluBits.W))
-  })
-
-  // FIXME: the following three will change once we properly support SHR and SHL
-  val ub = io.b.asUInt
-  val width = log2Ceil(aluBits)
-  val m = ~ub(width - 1, 0) + 1.U
-
-  val n = ub(width - 1, 0)
-  val fop = Seq(Mux(io.a < io.b, io.a, io.b), Mux(io.a < io.b, io.b, io.a),
-    io.a + io.b, io.a >> n, io.a << m)
-
-  val opmux = Seq.tabulate(ALU_OP_NUM)(i => ALU_OP(i) -> fop(i))
-  io.y := MuxLookup(io.opcode, io.a, opmux)
-}
-
-/** Pipelined ALU */
-class AluReg(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val a = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
-    val b = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
-    val y = ValidIO(UInt(p(CoreKey).accBits.W))
-  })
-  val alu = Module(new Alu)
-  val rA = RegEnable(io.a.bits, io.a.valid)
-  val rB = RegEnable(io.b.bits, io.b.valid)
-  val valid = RegNext(io.b.valid)
-
-  alu.io.opcode := io.opcode
-
-  // register input
-  alu.io.a := rA.asSInt
-  alu.io.b := rB.asSInt
-
-  // output
-  io.y.valid := valid
-  io.y.bits := alu.io.y.asUInt
-}
-
-/** Vector of pipeline ALUs */
-class AluVector(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val opcode = Input(UInt(C_ALU_OP_BITS.W))
-    val acc_a = new TensorMasterData(tensorType = "acc")
-    val acc_b = new TensorMasterData(tensorType = "acc")
-    val acc_y = new TensorClientData(tensorType = "acc")
-    val out = new TensorClientData(tensorType = "out")
-  })
-  val blockOut = p(CoreKey).blockOut
-  val f = Seq.fill(blockOut)(Module(new AluReg))
-  val valid = Wire(Vec(blockOut, Bool()))
-  for (i <- 0 until blockOut) {
-    f(i).io.opcode := io.opcode
-    f(i).io.a.valid := io.acc_a.data.valid
-    f(i).io.a.bits := io.acc_a.data.bits(0)(i)
-    f(i).io.b.valid := io.acc_b.data.valid
-    f(i).io.b.bits := io.acc_b.data.bits(0)(i)
-    valid(i) := f(i).io.y.valid
-    io.acc_y.data.bits(0)(i) := f(i).io.y.bits
-    io.out.data.bits(0)(i) := f(i).io.y.bits
-  }
-  io.acc_y.data.valid := valid.asUInt.andR
-  io.out.data.valid := valid.asUInt.andR
-}
-
-/** TensorAlu.
- *
- * This unit instantiates the ALU vector unit (AluVector) and goes over the
- * micro-ops (uops), which are used to read the source operands (vectors)
- * from the acc-scratchpad; the results are then written back to the same
- * acc-scratchpad.
- */
-class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val aluBits = p(CoreKey).accBits
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val uop = new UopMaster
-    val acc = new TensorMaster(tensorType = "acc")
-    val out = new TensorMaster(tensorType = "out")
-  })
-  val sIdle :: sReadUop :: sComputeIdx :: sReadTensorA :: sReadTensorB :: sExe :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-  val alu = Module(new AluVector)
-  val dec = io.inst.asTypeOf(new AluDecode)
-  val uop_idx = Reg(chiselTypeOf(dec.uop_end))
-  val uop_end = dec.uop_end
-  val uop_dst = Reg(chiselTypeOf(dec.uop_end))
-  val uop_src = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_o = Reg(chiselTypeOf(dec.lp_0))
-  val dst_o = Reg(chiselTypeOf(dec.uop_end))
-  val src_o = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_i = Reg(chiselTypeOf(dec.lp_1))
-  val dst_i = Reg(chiselTypeOf(dec.uop_end))
-  val src_i = Reg(chiselTypeOf(dec.uop_end))
-  val done =
-    state === sExe &
-      alu.io.out.data.valid &
-      (cnt_o === dec.lp_0 - 1.U) &
-      (cnt_i === dec.lp_1 - 1.U) &
-      (uop_idx === uop_end - 1.U)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadUop
-      }
-    }
-    is(sReadUop) {
-      state := sComputeIdx
-    }
-    is(sComputeIdx) {
-      state := sReadTensorA
-    }
-    is(sReadTensorA) {
-      state := sReadTensorB
-    }
-    is(sReadTensorB) {
-      state := sExe
-    }
-    is(sExe) {
-      when(alu.io.out.data.valid) {
-        when(
-          (cnt_o === dec.lp_0 - 1.U) &&
-            (cnt_i === dec.lp_1 - 1.U) &&
-            (uop_idx === uop_end - 1.U)) {
-          state := sIdle
-        }.otherwise {
-          state := sReadUop
-        }
-      }
-    }
-  }
-
-  when(
-    state === sIdle ||
-      (state === sExe &&
-        alu.io.out.data.valid &&
-        uop_idx === uop_end - 1.U)) {
-    uop_idx := dec.uop_begin
-  }.elsewhen(state === sExe && alu.io.out.data.valid) {
-    uop_idx := uop_idx + 1.U
-  }
-
-  when(state === sIdle) {
-    cnt_o := 0.U
-    dst_o := 0.U
-    src_o := 0.U
-  }.elsewhen(
-    state === sExe &&
-      alu.io.out.data.valid &&
-      uop_idx === uop_end - 1.U &&
-      cnt_i === dec.lp_1 - 1.U) {
-    cnt_o := cnt_o + 1.U
-    dst_o := dst_o + dec.dst_0
-    src_o := src_o + dec.src_0
-  }
-
-  when(state === sIdle) {
-    cnt_i := 0.U
-    dst_i := 0.U
-    src_i := 0.U
-  }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-    cnt_i := 0.U
-    dst_i := dst_o
-    src_i := src_o
-  }.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
-    cnt_i := cnt_i + 1.U
-    dst_i := dst_i + dec.dst_1
-    src_i := src_i + dec.src_1
-  }
-
-  when(state === sComputeIdx && io.uop.data.valid) {
-    uop_dst := io.uop.data.bits.u0 + dst_i
-    uop_src := io.uop.data.bits.u1 + src_i
-  }
-
-  // uop
-  io.uop.idx.valid := state === sReadUop
-  io.uop.idx.bits := uop_idx
-
-  // acc_i
-  io.acc.rd.idx.valid := state === sReadTensorA | (state === sReadTensorB & ~dec.alu_use_imm)
-  io.acc.rd.idx.bits := Mux(state === sReadTensorA, uop_dst, uop_src)
-
-  // imm
-  val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
-  tensorImm.data.valid := state === sReadTensorB
-  tensorImm.data.bits.foreach { b =>
-    b.foreach { c =>
-      c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
-        Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
-    }
-  }
-
-  // alu
-  val isSHR = dec.alu_op === ALU_OP(3)
-  val neg_shift = isSHR & dec.alu_imm(C_ALU_IMM_BITS - 1)
-  val fixme_alu_op = Cat(neg_shift, Mux(neg_shift, 0.U, dec.alu_op))
-  alu.io.opcode := fixme_alu_op
-  alu.io.acc_a.data.valid := io.acc.rd.data.valid & state === sReadTensorB
-  alu.io.acc_a.data.bits <> io.acc.rd.data.bits
-  alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
-    tensorImm.data.valid,
-    io.acc.rd.data.valid & state === sExe)
-  alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
-    tensorImm.data.bits,
-    io.acc.rd.data.bits)
-
-  // acc_o
-  io.acc.wr.valid := alu.io.acc_y.data.valid
-  io.acc.wr.bits.idx := uop_dst
-  io.acc.wr.bits.data <> alu.io.acc_y.data.bits
-
-  // out
-  io.out.wr.valid := alu.io.out.data.valid
-  io.out.wr.bits.idx := uop_dst
-  io.out.wr.bits.data <> alu.io.out.data.bits
-  io.out.tieoffRead() // write-only
-
-  io.done := done
-
-  if (debug) {
-
-    when(state === sReadUop) {
-      printf("[TensorAlu] [uop] idx:%x\n", uop_idx)
-    }
-
-    when(state === sReadTensorA) {
-      printf("[TensorAlu] [uop] dst:%x src:%x\n", uop_dst, uop_src)
-    }
-
-    when(state === sIdle && io.start) {
-      printf(p"[TensorAlu] decode:$dec\n")
-    }
-
-    alu.io.acc_a.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_a.data.valid) {
-            printf("[TensorAlu] [a] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.acc_b.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_b.data.valid) {
-            printf("[TensorAlu] [b] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.acc_y.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.acc_y.data.valid) {
-            printf("[TensorAlu] [y] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    alu.io.out.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(alu.io.out.data.valid) {
-            printf("[TensorAlu] [out] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala
deleted file mode 100644
index f2d295f..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import chisel3.experimental._
-import vta.util.config._
-import scala.math.pow
-
-/** Pipelined multiply and accumulate */
-class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module {
-  val outBits = Math.max(aBits + bBits, cBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val c = Input(SInt(cBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val mult = Wire(SInt((aBits + bBits).W))
-  val add = Wire(SInt(outBits.W))
-  val rA = RegNext(io.a)
-  val rB = RegNext(io.b)
-  val rC = RegNext(io.c)
-
-  mult := rA * rB
-  add := rC +& mult
-
-  io.y := add
-}
-
-/** PipeAdder
- *
- * This unit loads input bits into registers and performs addition in the next cycle
- */
-class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module {
-  val outBits = Math.max(aBits, bBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val add = Wire(SInt(outBits.W))
-  val rA = RegNext(io.a)
-  val rB = RegNext(io.b)
-  add := rA +& rB
-  io.y := add
-}
-
-/** Adder
- *
- * This unit wires input bits to an adder directly.
- * The output comes out of combinational logic without waiting for another cycle.
- */
-class Adder(aBits: Int = 8, bBits: Int = 8) extends Module {
-  val outBits = Math.max(aBits, bBits) + 1
-  val io = IO(new Bundle {
-    val a = Input(SInt(aBits.W))
-    val b = Input(SInt(bBits.W))
-    val y = Output(SInt(outBits.W))
-  })
-  val add = Wire(SInt(outBits.W))
-  val rA = Wire(SInt(aBits.W))
-  val rB = Wire(SInt(bBits.W))
-  rA := io.a
-  rB := io.b
-  add := rA +& rB
-  io.y := add
-}
-
-/** Pipelined DotProduct based on MAC and PipeAdder */
-class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module {
-  val errorMsg =
-    s"\n\n[VTA] [DotProduct] size must be greater than 4 and a power of 2\n\n"
-  require(size >= 2 && isPow2(size), errorMsg)
-  val b = aBits + bBits
-  val outBits = b + log2Ceil(size) + 1
-  val io = IO(new Bundle {
-    val a = Input(Vec(size, SInt(aBits.W)))
-    val b = Input(Vec(size, SInt(bBits.W)))
-    val y = Output(SInt(outBits.W))
-  })
-  val s = Seq.tabulate(log2Ceil(size + 1))(i =>
-    pow(2, log2Ceil(size) - i).toInt) // # of total layers
-  val p = log2Ceil(size / 2) + 1 // # of adder layers
-  val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs
-  val a = Seq.tabulate(p)(
-    i =>
-      Seq.fill(s(i + 1))(
-        if (i == 0)
-          Module(new PipeAdder(aBits = (b + i + 1), bBits = (b + i + 1)))
-        else
-          Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer
-
-  // Vector MACs
-  for (i <- 0 until s(0)) {
-    m(i).io.a := io.a(i)
-    m(i).io.b := io.b(i)
-    m(i).io.c := 0.S
-  }
-
-  // PipeAdder Reduction
-  for (i <- 0 until p) {
-    for (j <- 0 until s(i + 1)) {
-      if (i == 0) {
-        // First layer of PipeAdders
-        a(i)(j).io.a := m(2 * j).io.y
-        a(i)(j).io.b := m(2 * j + 1).io.y
-      } else {
-        a(i)(j).io.a := a(i - 1)(2 * j).io.y
-        a(i)(j).io.b := a(i - 1)(2 * j + 1).io.y
-      }
-    }
-  }
-
-  // last adder
-  io.y := a(p - 1)(0).io.y
-}
-
-/** Perform matrix-vector-multiplication based on DotProduct */
-class MatrixVectorMultiplication(implicit p: Parameters) extends Module {
-  val accBits = p(CoreKey).accBits
-  val size = p(CoreKey).blockOut
-  val inpBits = p(CoreKey).inpBits
-  val wgtBits = p(CoreKey).wgtBits
-  val outBits = p(CoreKey).outBits
-  val io = IO(new Bundle {
-    val reset = Input(Bool()) // FIXME: reset should be replaced by a load-acc instr
-    val inp = new TensorMasterData(tensorType = "inp")
-    val wgt = new TensorMasterData(tensorType = "wgt")
-    val acc_i = new TensorMasterData(tensorType = "acc")
-    val acc_o = new TensorClientData(tensorType = "acc")
-    val out = new TensorClientData(tensorType = "out")
-  })
-  val dot = Seq.fill(size)(
-    Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size)))
-  // Latency is defined as two in the following, because there is one cycle in the MAC module,
-  // and another cycle in the pipelined adders as the first layer of the accumulator
-  val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2)))
-  val add = Seq.fill(size)(Wire(SInt(accBits.W)))
-  val vld = Wire(Vec(size, Bool()))
-
-  for (i <- 0 until size) {
-    acc(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
-    acc(i).io.enq.bits := io.acc_i.data.bits(0)(i)
-    for (j <- 0 until size) {
-      dot(i).io.a(j) := io.inp.data.bits(0)(j).asSInt
-      dot(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
-    }
-    add(i) := acc(i).io.deq.bits.asSInt + dot(i).io.y
-    io.acc_o.data.bits(0)(i) := Mux(io.reset, 0.U, add(i).asUInt)
-    io.out.data.bits(0)(i) := add(i).asUInt
-    vld(i) := acc(i).io.deq.valid
-  }
-  io.acc_o.data.valid := vld.asUInt.andR | io.reset
-  io.out.data.valid := vld.asUInt.andR
-}
-
-/** TensorGemm.
- *
- * This unit instantiates the MatrixVectorMultiplication and goes over the
- * micro-ops (uops), which are used to read inputs, weights and biases,
- * and writes results back to the acc and out scratchpads.
- *
- * Also, the TensorGemm uses the reset field in the Gemm instruction to
- * clear or zero-out the acc-scratchpad locations based on the micro-ops.
- */
-class TensorGemm(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val uop = new UopMaster
-    val inp = new TensorMaster(tensorType = "inp")
-    val wgt = new TensorMaster(tensorType = "wgt")
-    val acc = new TensorMaster(tensorType = "acc")
-    val out = new TensorMaster(tensorType = "out")
-  })
-  val sIdle :: sReadUop :: sComputeIdx :: sReadTensor :: sExe :: sWait :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-  val mvc = Module(new MatrixVectorMultiplication)
-  val dec = io.inst.asTypeOf(new GemmDecode)
-  val uop_idx = Reg(chiselTypeOf(dec.uop_end))
-  val uop_end = dec.uop_end
-  val uop_acc = Reg(chiselTypeOf(dec.uop_end))
-  val uop_inp = Reg(chiselTypeOf(dec.uop_end))
-  val uop_wgt = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_o = Reg(chiselTypeOf(dec.lp_0))
-  val acc_o = Reg(chiselTypeOf(dec.uop_end))
-  val inp_o = Reg(chiselTypeOf(dec.uop_end))
-  val wgt_o = Reg(chiselTypeOf(dec.uop_end))
-  val cnt_i = Reg(chiselTypeOf(dec.lp_1))
-  val acc_i = Reg(chiselTypeOf(dec.uop_end))
-  val inp_i = Reg(chiselTypeOf(dec.uop_end))
-  val wgt_i = Reg(chiselTypeOf(dec.uop_end))
-  val pBits = log2Ceil(p(CoreKey).blockOut) + 1
-  val inflight = Reg(UInt(pBits.W))
-  // Latency is defined as two in the following, because there is one cycle in the MAC module,
-  // and another cycle in the pipelined adders as the first layer of the accumulator
-  val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2))
-  val done = inflight === 0.U &
-    ((state === sExe &
-      cnt_o === dec.lp_0 - 1.U &
-      cnt_i === dec.lp_1 - 1.U &
-      uop_idx === uop_end - 1.U &
-      inflight === 0.U) |
-      state === sWait)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sReadUop
-      }
-    }
-    is(sReadUop) {
-      state := sComputeIdx
-    }
-    is(sComputeIdx) {
-      state := sReadTensor
-    }
-    is(sReadTensor) {
-      state := sExe
-    }
-    is(sExe) {
-      when(
-        (cnt_o === dec.lp_0 - 1.U) &&
-          (cnt_i === dec.lp_1 - 1.U) &&
-          (uop_idx === uop_end - 1.U)) {
-        when(inflight =/= 0.U) {
-          state := sWait
-        }.otherwise {
-          state := sIdle
-        }
-      }.otherwise {
-        state := sReadUop
-      }
-    }
-    is(sWait) {
-      when(inflight === 0.U) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    inflight := 0.U
-  }.elsewhen(!dec.reset) {
-    when((state === sReadTensor) && mvc.io.acc_o.data.valid) { // issue & commit
-      inflight := inflight
-    }.elsewhen(state === sReadTensor) { // issue a tensor
-      inflight := inflight + 1.U
-    }.elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor
-      inflight := inflight - 1.U
-    }
-  }
-
-  when(
-    state === sIdle ||
-      (state === sExe &&
-        uop_idx === uop_end - 1.U)) {
-    uop_idx := dec.uop_begin
-  }.elsewhen(state === sExe && dec.uop_begin =/= uop_end) {
-    uop_idx := uop_idx + 1.U
-  }
-
-  when(state === sIdle) {
-    cnt_o := 0.U
-    acc_o := 0.U
-    inp_o := 0.U
-    wgt_o := 0.U
-  }.elsewhen(
-    state === sExe &&
-      uop_idx === uop_end - 1.U &&
-      cnt_i === dec.lp_1 - 1.U) {
-    cnt_o := cnt_o + 1.U
-    acc_o := acc_o + dec.acc_0
-    inp_o := inp_o + dec.inp_0
-    wgt_o := wgt_o + dec.wgt_0
-  }
-
-  when(state === sIdle) {
-    cnt_i := 0.U
-    acc_i := 0.U
-    inp_i := 0.U
-    wgt_i := 0.U
-  }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-    cnt_i := 0.U
-    acc_i := acc_o
-    inp_i := inp_o
-    wgt_i := wgt_o
-  }.elsewhen(state === sExe && uop_idx === uop_end - 1.U) {
-    cnt_i := cnt_i + 1.U
-    acc_i := acc_i + dec.acc_1
-    inp_i := inp_i + dec.inp_1
-    wgt_i := wgt_i + dec.wgt_1
-  }
-
-  when(state === sComputeIdx && io.uop.data.valid) {
-    uop_acc := io.uop.data.bits.u0 + acc_i
-    uop_inp := io.uop.data.bits.u1 + inp_i
-    uop_wgt := io.uop.data.bits.u2 + wgt_i
-  }
-
-  wrpipe.io.enq.valid := state === sExe & ~dec.reset
-  wrpipe.io.enq.bits := uop_acc
-
-  // uop
-  io.uop.idx.valid := state === sReadUop
-  io.uop.idx.bits := uop_idx
-
-  // inp
-  io.inp.rd.idx.valid := state === sReadTensor
-  io.inp.rd.idx.bits := uop_inp
-  io.inp.tieoffWrite() // read-only
-
-  // wgt
-  io.wgt.rd.idx.valid := state === sReadTensor
-  io.wgt.rd.idx.bits := uop_wgt
-  io.wgt.tieoffWrite() // read-only
-
-  // acc_i
-  io.acc.rd.idx.valid := state === sReadTensor
-  io.acc.rd.idx.bits := uop_acc
-
-  // mvc
-  mvc.io.reset := dec.reset & state === sExe
-  mvc.io.inp.data <> io.inp.rd.data
-  mvc.io.wgt.data <> io.wgt.rd.data
-  mvc.io.acc_i.data <> io.acc.rd.data
-
-  // acc_o
-  io.acc.wr.valid := mvc.io.acc_o.data.valid &
-    Mux(dec.reset, true.B, wrpipe.io.deq.valid)
-  io.acc.wr.bits.idx := Mux(dec.reset, uop_acc, wrpipe.io.deq.bits)
-  io.acc.wr.bits.data <> mvc.io.acc_o.data.bits
-
-  // out
-  io.out.wr.valid := mvc.io.out.data.valid & wrpipe.io.deq.valid
-  io.out.wr.bits.idx := wrpipe.io.deq.bits
-  io.out.wr.bits.data <> mvc.io.out.data.bits
-  io.out.tieoffRead() // write-only
-
-  io.done := done
-
-  if (debug) {
-    when(state === sReadUop && ~dec.reset) {
-      printf("[TensorGemm] [uop] idx:%x\n", uop_idx)
-    }
-
-    when(state === sReadTensor && ~dec.reset) {
-      printf("[TensorGemm] [uop] acc:%x inp:%x wgt:%x\n", uop_acc, uop_inp, uop_wgt)
-    }
-
-    io.inp.rd.data.bits.zipWithIndex.foreach {
-      case (r, i) =>
-        when(io.inp.rd.data.valid && ~dec.reset) {
-          printf("[TensorGemm] [inp] i:%x val:%x\n", i.U, r.asUInt)
-        }
-    }
-
-    io.wgt.rd.data.bits.zipWithIndex.foreach {
-      case (r, i) =>
-        when(io.wgt.rd.data.valid && ~dec.reset) {
-          printf("[TensorGemm] [wgt] i:%x val:%x\n", i.U, r.asUInt)
-        }
-    }
-
-    io.acc.rd.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(io.acc.rd.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [acc_i] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    mvc.io.acc_o.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(mvc.io.acc_o.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [acc_o] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-
-    mvc.io.out.data.bits.foreach { tensor =>
-      tensor.zipWithIndex.foreach {
-        case (elem, i) =>
-          when(mvc.io.out.data.valid && ~dec.reset) {
-            printf("[TensorGemm] [out] i:%x val:%x\n", i.U, elem)
-          }
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala
deleted file mode 100644
index 5ab690d..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorLoad.scala
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorLoad.
- *
- * Load 1D and 2D tensors from main memory (DRAM) to input/weight
- * scratchpads (SRAM). Zero padding is also supported while doing the
- * load. Zero-padding works on the y and x axes, and it is
- * managed by TensorPadCtrl. The TensorDataCtrl is in charge of
- * handling the way tensors are stored on the scratchpads.
- */
-class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
-    implicit p: Parameters)
-    extends Module {
-  val tp = new TensorParams(tensorType)
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_rd = new VMEReadMaster
-    val tensor = new TensorClient(tensorType)
-  })
-  val sizeFactor = tp.tensorLength * tp.numMemBlock
-  val strideFactor = tp.tensorLength * tp.tensorWidth
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val dataCtrl = Module(
-    new TensorDataCtrl(tensorType, sizeFactor, strideFactor))
-  val dataCtrlDone = RegInit(false.B)
-  val yPadCtrl0 = Module(new TensorPadCtrl(padType = "YPad0", sizeFactor))
-  val yPadCtrl1 = Module(new TensorPadCtrl(padType = "YPad1", sizeFactor))
-  val xPadCtrl0 = Module(new TensorPadCtrl(padType = "XPad0", sizeFactor))
-  val xPadCtrl1 = Module(new TensorPadCtrl(padType = "XPad1", sizeFactor))
-
-  val tag = Reg(UInt(log2Ceil(tp.numMemBlock).W))
-  val set = Reg(UInt(log2Ceil(tp.tensorLength).W))
-
-  val sIdle :: sYPad0 :: sXPad0 :: sReadCmd :: sReadData :: sXPad1 :: sYPad1 :: Nil =
-    Enum(7)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        when(dec.ypad_0 =/= 0.U) {
-          state := sYPad0
-        }.elsewhen(dec.xpad_0 =/= 0.U) {
-          state := sXPad0
-        }.otherwise {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sYPad0) {
-      when(yPadCtrl0.io.done) {
-        when(dec.xpad_0 =/= 0.U) {
-          state := sXPad0
-        }.otherwise {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sXPad0) {
-      when(xPadCtrl0.io.done) {
-        state := sReadCmd
-      }
-    }
-    is(sReadCmd) {
-      when(io.vme_rd.cmd.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.vme_rd.data.valid) {
-        when(dataCtrl.io.done) {
-          when(dec.xpad_1 =/= 0.U) {
-            state := sXPad1
-          }.elsewhen(dec.ypad_1 =/= 0.U) {
-            state := sYPad1
-          }.otherwise {
-            state := sIdle
-          }
-        }.elsewhen(dataCtrl.io.stride) {
-          when(dec.xpad_1 =/= 0.U) {
-            state := sXPad1
-          }.elsewhen(dec.xpad_0 =/= 0.U) {
-            state := sXPad0
-          }.otherwise {
-            state := sReadCmd
-          }
-        }.elsewhen(dataCtrl.io.split) {
-          state := sReadCmd
-        }
-      }
-    }
-    is(sXPad1) {
-      when(xPadCtrl1.io.done) {
-        when(dataCtrlDone) {
-          when(dec.ypad_1 =/= 0.U) {
-            state := sYPad1
-          }.otherwise {
-            state := sIdle
-          }
-        }.otherwise {
-          when(dec.xpad_0 =/= 0.U) {
-            state := sXPad0
-          }.otherwise {
-            state := sReadCmd
-          }
-        }
-      }
-    }
-    is(sYPad1) {
-      when(yPadCtrl1.io.done && dataCtrlDone) {
-        state := sIdle
-      }
-    }
-  }
-
-  // data controller
-  dataCtrl.io.start := state === sIdle & io.start
-  dataCtrl.io.inst := io.inst
-  dataCtrl.io.baddr := io.baddr
-  dataCtrl.io.xinit := io.vme_rd.cmd.fire()
-  dataCtrl.io.xupdate := io.vme_rd.data.fire()
-  dataCtrl.io.yupdate := io.vme_rd.data.fire()
-
-  when(state === sIdle) {
-    dataCtrlDone := false.B
-  }.elsewhen(io.vme_rd.data.fire() && dataCtrl.io.done) {
-    dataCtrlDone := true.B
-  }
-
-  // pad
-  yPadCtrl0.io.start := dec.ypad_0 =/= 0.U & state === sIdle & io.start
-
-  yPadCtrl1.io.start := dec.ypad_1 =/= 0.U &
-    ((io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U) |
-      (state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone))
-
-  xPadCtrl0.io.start := dec.xpad_0 =/= 0.U &
-    ((state === sIdle & io.start) |
-      (state === sYPad0 & yPadCtrl0.io.done) |
-      (io.vme_rd.data.fire() & ~dataCtrlDone & dataCtrl.io.stride & dec.xpad_1 === 0.U) |
-      (state === sXPad1 & xPadCtrl1.io.done & ~dataCtrlDone))
-
-  xPadCtrl1.io.start := dec.xpad_1 =/= 0.U & io.vme_rd.data.fire() &
-    ((dataCtrl.io.done) | (~dataCtrl.io.done & dataCtrl.io.stride & dec.xpad_1 =/= 0.U))
-
-  yPadCtrl0.io.inst := io.inst
-  yPadCtrl1.io.inst := io.inst
-  xPadCtrl0.io.inst := io.inst
-  xPadCtrl1.io.inst := io.inst
-
-  // read-from-dram
-  io.vme_rd.cmd.valid := state === sReadCmd
-  io.vme_rd.cmd.bits.addr := dataCtrl.io.addr
-  io.vme_rd.cmd.bits.len := dataCtrl.io.len
-
-  io.vme_rd.data.ready := state === sReadData
-
-  // write-to-sram
-  val isZeroPad = state === sYPad0 |
-    state === sXPad0 |
-    state === sXPad1 |
-    state === sYPad1
-
-  when(state === sIdle || state === sReadCmd || tag === (tp.numMemBlock - 1).U) {
-    tag := 0.U
-  }.elsewhen(io.vme_rd.data.fire() || isZeroPad) {
-    tag := tag + 1.U
-  }
-
-  when(state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
-    set := 0.U
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
-    set := set + 1.U
-  }
-
-  val waddr_cur = Reg(UInt(tp.memAddrBits.W))
-  val waddr_nxt = Reg(UInt(tp.memAddrBits.W))
-  when(state === sIdle) {
-    waddr_cur := dec.sram_offset
-    waddr_nxt := dec.sram_offset
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad)
-    && set === (tp.tensorLength - 1).U
-    && tag === (tp.numMemBlock - 1).U)
-  {
-    waddr_cur := waddr_cur + 1.U
-  }.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire()) {
-    waddr_cur := waddr_nxt + dec.xsize
-    waddr_nxt := waddr_nxt + dec.xsize
-  }
-
-  val tensorFile = Seq.fill(tp.tensorLength) {
-    SyncReadMem(tp.memDepth, Vec(tp.numMemBlock, UInt(tp.memBlockBits.W)))
-  }
-  val wmask = Seq.fill(tp.tensorLength) { Wire(Vec(tp.numMemBlock, Bool())) }
-  val wdata = Seq.fill(tp.tensorLength) {
-    Wire(Vec(tp.numMemBlock, UInt(tp.memBlockBits.W)))
-  }
-  val no_mask = Wire(Vec(tp.numMemBlock, Bool()))
-  no_mask.foreach { m =>
-    m := true.B
-  }
-
-  for (i <- 0 until tp.tensorLength) {
-    for (j <- 0 until tp.numMemBlock) {
-      wmask(i)(j) := tag === j.U
-      wdata(i)(j) := Mux(isZeroPad, 0.U, io.vme_rd.data.bits)
-    }
-    val tdata = io.tensor.wr.bits.data(i).asUInt.asTypeOf(wdata(i))
-    val muxWen =
-      Mux(state === sIdle,
-        io.tensor.wr.valid,
-        (io.vme_rd.data.fire() | isZeroPad) & set === i.U)
-    val muxWaddr = Mux(state === sIdle, io.tensor.wr.bits.idx, waddr_cur)
-    val muxWdata = Mux(state === sIdle, tdata, wdata(i))
-    val muxWmask = Mux(state === sIdle, no_mask, wmask(i))
-    when(muxWen) {
-      tensorFile(i).write(muxWaddr, muxWdata, muxWmask)
-    }
-  }
-
-  // read-from-sram
-  val rvalid = RegNext(io.tensor.rd.idx.valid)
-  io.tensor.rd.data.valid := rvalid
-
-  val rdata =
-    tensorFile.map(_.read(io.tensor.rd.idx.bits, io.tensor.rd.idx.valid))
-  rdata.zipWithIndex.foreach {
-    case (r, i) =>
-      io.tensor.rd.data.bits(i) := r.asUInt.asTypeOf(io.tensor.rd.data.bits(i))
-  }
-
-  // done
-  val done_no_pad = io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
-  val done_x_pad = state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone & dec.ypad_1 === 0.U
-  val done_y_pad = state === sYPad1 & dataCtrlDone & yPadCtrl1.io.done
-  io.done := done_no_pad | done_x_pad | done_y_pad
-
-  // debug
-  if (debug) {
-    if (tensorType == "inp") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [inp] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-      when(state === sYPad0) {
-        printf("[TensorLoad] [inp] sYPad0\n")
-      }
-      when(state === sYPad1) {
-        printf("[TensorLoad] [inp] sYPad1\n")
-      }
-      when(state === sXPad0) {
-        printf("[TensorLoad] [inp] sXPad0\n")
-      }
-      when(state === sXPad1) {
-        printf("[TensorLoad] [inp] sXPad1\n")
-      }
-    } else if (tensorType == "wgt") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [wgt] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-    } else if (tensorType == "acc") {
-      when(io.vme_rd.cmd.fire()) {
-        printf("[TensorLoad] [acc] cmd addr:%x len:%x\n",
-          dataCtrl.io.addr,
-          dataCtrl.io.len)
-      }
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala
deleted file mode 100644
index 9b4bf74..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorStore.scala
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorStore.
- *
- * Store 1D and 2D tensors from out-scratchpad (SRAM) to main memory (DRAM).
- */
-class TensorStore(tensorType: String = "none", debug: Boolean = false)(
-    implicit p: Parameters)
-    extends Module {
-  val tp = new TensorParams(tensorType)
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val vme_wr = new VMEWriteMaster
-    val tensor = new TensorClient(tensorType)
-  })
-  val tensorLength = tp.tensorLength
-  val tensorWidth = tp.tensorWidth
-  val tensorElemBits = tp.tensorElemBits
-  val memBlockBits = tp.memBlockBits
-  val memDepth = tp.memDepth
-  val numMemBlock = tp.numMemBlock
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-  val waddr_cur = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val waddr_nxt = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val xcnt = Reg(chiselTypeOf(io.vme_wr.cmd.bits.len))
-  val xlen = Reg(chiselTypeOf(io.vme_wr.cmd.bits.len))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize << log2Ceil(tensorLength * numMemBlock)) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-  val ycnt = Reg(chiselTypeOf(dec.ysize))
-  val ysize = dec.ysize
-  val tag = Reg(UInt(8.W))
-  val set = Reg(UInt(8.W))
-
-  val xfer_bytes = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
-  val xstride_bytes = dec.xstride << log2Ceil(tensorLength * tensorWidth)
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8
-  val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3)
-
-  val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes)))
-  val xfer_split_addr = waddr_cur + xfer_bytes
-  val xfer_stride_addr = waddr_nxt + xstride_bytes
-
-  val xfer_init_bytes   = xmax_bytes - xfer_init_addr % xmax_bytes
-  val xfer_init_pulses  = xfer_init_bytes >> pulse_bytes_bits
-  val xfer_split_bytes  = xmax_bytes - xfer_split_addr % xmax_bytes
-  val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits
-  val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes
-  val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits
-
-  val sIdle :: sWriteCmd :: sWriteData :: sReadMem :: sWriteAck :: Nil = Enum(5)
-  val state = RegInit(sIdle)
-
-  // control
-  switch(state) {
-    is(sIdle) {
-      xfer_bytes := xfer_init_bytes
-      when (io.start) {
-        state := sWriteCmd
-        when (xsize < xfer_init_pulses) {
-          xlen := xsize
-          xrem := 0.U
-        }.otherwise {
-          xlen := xfer_init_pulses - 1.U
-          xrem := xsize - xfer_init_pulses
-        }
-      }
-    }
-    is(sWriteCmd) {
-      when(io.vme_wr.cmd.ready) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.vme_wr.data.ready) {
-        when(xcnt === xlen) {
-          state := sWriteAck
-        }.elsewhen(tag === (numMemBlock - 1).U) {
-          state := sReadMem
-        }
-      }
-    }
-    is(sReadMem) {
-      state := sWriteData
-    }
-    is(sWriteAck) {
-      when(io.vme_wr.ack) {
-        when(xrem === 0.U) {
-          when(ycnt === ysize - 1.U) {
-            state := sIdle
-          }.otherwise { // stride
-            state := sWriteCmd
-            xfer_bytes := xfer_stride_bytes
-            when(xsize < xfer_stride_pulses) {
-              xlen := xsize
-              xrem := 0.U
-            }.otherwise {
-              xlen := xfer_stride_pulses - 1.U
-              xrem := xsize - xfer_stride_pulses
-            }
-          }
-        } // split
-        .elsewhen(xrem < xfer_split_pulses) {
-          state := sWriteCmd
-          xfer_bytes := xfer_split_bytes
-          xlen := xrem
-          xrem := 0.U
-        }
-        .otherwise {
-          state := sWriteCmd
-          xfer_bytes := xfer_split_bytes
-          xlen := xfer_split_pulses - 1.U
-          xrem := xrem - xfer_split_pulses
-        }
-      }
-    }
-  }
-
-  // write-to-sram
-  val tensorFile = Seq.fill(tensorLength) {
-    SyncReadMem(memDepth, Vec(numMemBlock, UInt(memBlockBits.W)))
-  }
-  val wdata_t = Wire(Vec(numMemBlock, UInt(memBlockBits.W)))
-  val no_mask = Wire(Vec(numMemBlock, Bool()))
-
-  wdata_t := DontCare
-  no_mask.foreach { m =>
-    m := true.B
-  }
-
-  for (i <- 0 until tensorLength) {
-    val inWrData = io.tensor.wr.bits.data(i).asUInt.asTypeOf(wdata_t)
-    when(io.tensor.wr.valid) {
-      tensorFile(i).write(io.tensor.wr.bits.idx, inWrData, no_mask)
-    }
-  }
-
-  // read-from-sram
-  val stride = state === sWriteAck &
-    io.vme_wr.ack &
-    xcnt === xlen + 1.U &
-    xrem === 0.U &
-    ycnt =/= ysize - 1.U
-
-  when(state === sIdle) {
-    ycnt := 0.U
-  }.elsewhen(stride) {
-    ycnt := ycnt + 1.U
-  }
-
-  when(state === sWriteCmd || tag === (numMemBlock - 1).U) {
-    tag := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
-    tag := tag + 1.U
-  }
-
-  when(
-    state === sWriteCmd || (set === (tensorLength - 1).U && tag === (numMemBlock - 1).U)) {
-    set := 0.U
-  }.elsewhen(io.vme_wr.data.fire() && tag === (numMemBlock - 1).U) {
-    set := set + 1.U
-  }
-
-  val raddr_cur = Reg(UInt(tp.memAddrBits.W))
-  val raddr_nxt = Reg(UInt(tp.memAddrBits.W))
-  when(state === sIdle) {
-    raddr_cur := dec.sram_offset
-    raddr_nxt := dec.sram_offset
-  }.elsewhen(io.vme_wr.data.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
-    raddr_cur := raddr_cur + 1.U
-  }.elsewhen(stride) {
-    raddr_cur := raddr_nxt + dec.xsize
-    raddr_nxt := raddr_nxt + dec.xsize
-  }
-
-  val tread = Seq.tabulate(tensorLength) { i =>
-    i.U ->
-      tensorFile(i).read(raddr_cur, state === sWriteCmd | state === sReadMem)
-  }
-  val mdata = MuxLookup(set, 0.U.asTypeOf(chiselTypeOf(wdata_t)), tread)
-
-  // write-to-dram
-  when(state === sIdle) {
-    waddr_cur := xfer_init_addr
-    waddr_nxt := xfer_init_addr
-  }.elsewhen(state === sWriteAck && io.vme_wr.ack && xrem =/= 0.U) {
-    waddr_cur := xfer_split_addr
-  }.elsewhen(stride) {
-    waddr_cur := xfer_stride_addr
-    waddr_nxt := xfer_stride_addr
-  }
-
-  io.vme_wr.cmd.valid := state === sWriteCmd
-  io.vme_wr.cmd.bits.addr := waddr_cur
-  io.vme_wr.cmd.bits.len := xlen
-
-  io.vme_wr.data.valid := state === sWriteData
-  io.vme_wr.data.bits := mdata(tag)
-
-  when(state === sWriteCmd) {
-    xcnt := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
-    xcnt := xcnt + 1.U
-  }
-
-  // disable external read-from-sram requests
-  io.tensor.tieoffRead()
-
-  // done
-  io.done := state === sWriteAck & io.vme_wr.ack & xrem === 0.U & ycnt === ysize - 1.U
-
-  // debug
-  if (debug) {
-    when(io.vme_wr.cmd.fire()) {
-      printf("[TensorStore] ysize:%x ycnt:%x raddr:%x waddr:%x len:%x rem:%x\n",
-        ysize, ycnt, raddr_cur, waddr_cur, xlen, xrem)
-    }
-    when(io.vme_wr.data.fire()) {
-      printf("[TensorStore] data:%x\n", io.vme_wr.data.bits)
-    }
-    when(io.vme_wr.ack) {
-      printf("[TensorStore] ack\n")
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala
deleted file mode 100644
index d0a8ba7..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/TensorUtil.scala
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.core
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.shell._
-
-/** TensorParams.
- *
- * This Bundle derives parameters for each tensorType, including inputs (inp),
- * weights (wgt), biases (acc), and outputs (out). This is used to avoid
- * doing the same boring calculations over and over again.
- */
-class TensorParams(tensorType: String = "none")(implicit p: Parameters) extends Bundle {
-  val errorMsg =
-    s"\n\n[VTA] [TensorParams] only inp, wgt, acc, and out supported\n\n"
-
-  require(tensorType == "inp" || tensorType == "wgt"
-    || tensorType == "acc" || tensorType == "out",
-    errorMsg)
-
-  val (tensorLength, tensorWidth, tensorElemBits) =
-    if (tensorType == "inp")
-      (p(CoreKey).batch, p(CoreKey).blockIn, p(CoreKey).inpBits)
-    else if (tensorType == "wgt")
-      (p(CoreKey).blockOut, p(CoreKey).blockIn, p(CoreKey).wgtBits)
-    else if (tensorType == "acc")
-      (p(CoreKey).batch, p(CoreKey).blockOut, p(CoreKey).accBits)
-    else
-      (p(CoreKey).batch, p(CoreKey).blockOut, p(CoreKey).outBits)
-
-  val memBlockBits = p(ShellKey).memParams.dataBits
-  val numMemBlock = (tensorWidth * tensorElemBits) / memBlockBits
-
-  val memDepth =
-    if (tensorType == "inp")
-      p(CoreKey).inpMemDepth
-    else if (tensorType == "wgt")
-      p(CoreKey).wgtMemDepth
-    else if (tensorType == "acc")
-      p(CoreKey).accMemDepth
-    else
-      p(CoreKey).outMemDepth
-
-  val memAddrBits = log2Ceil(memDepth)
-}
-
-/** TensorMaster.
- *
- * This interface issue read and write tensor-requests to scratchpads. For example,
- * The TensorGemm unit uses this interface for managing the inputs (inp), weights (wgt),
- * biases (acc), and outputs (out).
- *
- */
-class TensorMaster(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val rd = new Bundle {
-    val idx = ValidIO(UInt(memAddrBits.W))
-    val data = Flipped(
-      ValidIO(Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))))
-  }
-  val wr = ValidIO(new Bundle {
-    val idx = UInt(memAddrBits.W)
-    val data = Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))
-  })
-  def tieoffRead() {
-    rd.idx.valid := false.B
-    rd.idx.bits := 0.U
-  }
-  def tieoffWrite() {
-    wr.valid := false.B
-    wr.bits.idx := 0.U
-    wr.bits.data.foreach { b =>
-      b.foreach { c =>
-        c := 0.U
-      }
-    }
-  }
-  override def cloneType =
-    new TensorMaster(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorClient.
- *
- * This interface receives read and write tensor-requests to scratchpads. For example,
- * The TensorLoad unit uses this interface for receiving read and write requests from
- * the TensorGemm unit.
- */
-class TensorClient(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val rd = new Bundle {
-    val idx = Flipped(ValidIO(UInt(memAddrBits.W)))
-    val data = ValidIO(
-      Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W))))
-  }
-  val wr = Flipped(ValidIO(new Bundle {
-    val idx = UInt(memAddrBits.W)
-    val data = Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))
-  }))
-  def tieoffRead() {
-    rd.data.valid := false.B
-    rd.data.bits.foreach { b =>
-      b.foreach { c =>
-        c := 0.U
-      }
-    }
-  }
-  override def cloneType =
-    new TensorClient(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorMasterData.
- *
- * This interface is only used for datapath only purposes and the direction convention
- * is based on the TensorMaster interface, which means this is an input. This interface
- * is used on datapath only module such MatrixVectorCore or AluVector.
- */
-class TensorMasterData(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val data = Flipped(
-    ValidIO(Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))))
-  override def cloneType =
-    new TensorMasterData(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorClientData.
- *
- * This interface is only used for datapath only purposes and the direction convention
- * is based on the TensorClient interface, which means this is an output. This interface
- * is used on datapath only module such MatrixVectorCore or AluVector.
- */
-class TensorClientData(tensorType: String = "none")
-  (implicit p: Parameters) extends TensorParams(tensorType) {
-  val data = ValidIO(
-    Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W))))
-  override def cloneType =
-    new TensorClientData(tensorType).asInstanceOf[this.type]
-}
-
-/** TensorPadCtrl. Zero-padding controller for TensorLoad. */
-class TensorPadCtrl(padType: String = "none", sizeFactor: Int = 1) extends Module {
-  val errorMsg =
-    s"\n\n\n[VTA-ERROR] only YPad0, YPad1, XPad0, or XPad1 supported\n\n\n"
-  require(padType == "YPad0" || padType == "YPad1"
-    || padType == "XPad0" || padType == "XPad1",
-    errorMsg)
-
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-  })
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-
-  val xmax = Reg(chiselTypeOf(dec.xsize))
-  val ymax = Reg(chiselTypeOf(dec.ypad_0))
-  val xcnt = Reg(chiselTypeOf(dec.xsize))
-  val ycnt = Reg(chiselTypeOf(dec.ypad_0))
-
-  val xval =
-    if (padType == "YPad0" || padType == "YPad1")
-      ((dec.xpad_0 + dec.xsize + dec.xpad_1) << log2Ceil(sizeFactor)) - 1.U
-    else if (padType == "XPad0")
-      (dec.xpad_0 << log2Ceil(sizeFactor)) - 1.U
-    else
-      (dec.xpad_1 << log2Ceil(sizeFactor)) - 1.U
-
-  val yval =
-    if (padType == "YPad0")
-      Mux(dec.ypad_0 =/= 0.U, dec.ypad_0 - 1.U, 0.U)
-    else if (padType == "YPad1")
-      Mux(dec.ypad_1 =/= 0.U, dec.ypad_1 - 1.U, 0.U)
-    else
-      0.U
-
-  val sIdle :: sActive :: Nil = Enum(2)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.start) {
-        state := sActive
-      }
-    }
-    is(sActive) {
-      when(ycnt === ymax && xcnt === xmax) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    xmax := xval
-    ymax := yval
-  }
-
-  when(state === sIdle || xcnt === xmax) {
-    xcnt := 0.U
-  }.elsewhen(state === sActive) {
-    xcnt := xcnt + 1.U
-  }
-
-  when(state === sIdle || ymax === 0.U) {
-    ycnt := 0.U
-  }.elsewhen(state === sActive && xcnt === xmax) {
-    ycnt := ycnt + 1.U
-  }
-
-  io.done := state === sActive & ycnt === ymax & xcnt === xmax
-}
-
-/** TensorDataCtrl. Data controller for TensorLoad. */
-class TensorDataCtrl(tensorType: String = "none",
-    sizeFactor: Int = 1, strideFactor: Int = 1)(implicit p: Parameters) extends Module {
-  val mp = p(ShellKey).memParams
-  val io = IO(new Bundle {
-    val start = Input(Bool())
-    val done = Output(Bool())
-    val inst = Input(UInt(INST_BITS.W))
-    val baddr = Input(UInt(mp.addrBits.W))
-    val xinit = Input(Bool())
-    val xupdate = Input(Bool())
-    val yupdate = Input(Bool())
-    val stride = Output(Bool())
-    val split = Output(Bool())
-    val commit = Output(Bool())
-    val addr = Output(UInt(mp.addrBits.W))
-    val len = Output(UInt(mp.lenBits.W))
-  })
-
-  val dec = io.inst.asTypeOf(new MemDecode)
-
-  val caddr = Reg(UInt(mp.addrBits.W))
-  val baddr = Reg(UInt(mp.addrBits.W))
-  val len = Reg(UInt(mp.lenBits.W))
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  val elemBytes =
-    if (tensorType == "inp") {
-      (p(CoreKey).batch * p(CoreKey).blockIn * p(CoreKey).inpBits) / 8
-    } else if (tensorType == "wgt") {
-      (p(CoreKey).blockOut * p(CoreKey).blockIn * p(CoreKey).wgtBits) / 8
-    } else {
-      (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).accBits) / 8
-    }
-
-  val xmax_bytes = ((1 << mp.lenBits) * mp.dataBits / 8).U
-  val xcnt = Reg(UInt(mp.lenBits.W))
-  val xrem = Reg(chiselTypeOf(dec.xsize))
-  val xsize = (dec.xsize << log2Ceil(sizeFactor)) - 1.U
-  val xmax = (1 << mp.lenBits).U
-  val ycnt = Reg(chiselTypeOf(dec.ysize))
-
-  val xfer_bytes = Reg(UInt(mp.addrBits.W))
-  val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3)
-  val xstride_bytes = dec.xstride << log2Ceil(elemBytes)
-
-  val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes)))
-  val xfer_split_addr = caddr + xfer_bytes
-  val xfer_stride_addr = baddr + xstride_bytes
-
-  val xfer_init_bytes   = xmax_bytes - xfer_init_addr % xmax_bytes
-  val xfer_init_pulses  = xfer_init_bytes >> pulse_bytes_bits
-  val xfer_split_bytes  = xmax_bytes - xfer_split_addr % xmax_bytes
-  val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits
-  val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes
-  val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits
-
-  val stride = xcnt === len &
-    xrem === 0.U &
-    ycnt =/= dec.ysize - 1.U
-
-  val split = xcnt === len & xrem =/= 0.U
-
-  when(io.start) {
-    xfer_bytes := xfer_init_bytes
-    when(xsize < xfer_init_pulses) {
-      len := xsize
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_init_pulses - 1.U
-      xrem := xsize - xfer_init_pulses
-    }
-  }.elsewhen(io.xupdate && stride) {
-    xfer_bytes := xfer_stride_bytes
-    when(xsize < xfer_stride_pulses) {
-      len := xsize
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_stride_pulses - 1.U
-      xrem := xsize - xfer_stride_pulses
-    }
-  }.elsewhen(io.xupdate && split) {
-    xfer_bytes := xfer_split_bytes
-    when(xrem < xfer_split_pulses) {
-      len := xrem
-      xrem := 0.U
-    }.otherwise {
-      len := xfer_split_pulses - 1.U
-      xrem := xrem - xfer_split_pulses
-    }
-  }
-
-  when(io.xinit) {
-    xcnt := 0.U
-  }.elsewhen(io.xupdate) {
-    xcnt := xcnt + 1.U
-  }
-
-  when(io.start) {
-    ycnt := 0.U
-  }.elsewhen(io.yupdate && stride) {
-    ycnt := ycnt + 1.U
-  }
-
-  when(io.start) {
-    caddr := xfer_init_addr
-    baddr := xfer_init_addr
-  }.elsewhen(io.yupdate) {
-    when(split) {
-      caddr := xfer_split_addr
-    }.elsewhen(stride) {
-      caddr := xfer_stride_addr
-      baddr := xfer_stride_addr
-    }
-  }
-
-  io.stride := stride
-  io.split := split
-  io.commit := xcnt === len
-  io.addr := caddr
-  io.len := len
-  io.done := xcnt === len &
-    xrem === 0.U &
-    ycnt === dec.ysize - 1.U
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala b/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala
deleted file mode 100644
index 673d390..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/core/package.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta
-
-/** This trick makes ISAConstants globally available */
-package object core extends vta.core.ISAConstants
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
deleted file mode 100644
index a428916..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Host DPI parameters */
-trait VTAHostDPIParams {
-  val dpiAddrBits = 8
-  val dpiDataBits = 32
-}
-
-/** Host master interface.
- *
- * This interface is tipically used by the Host
- */
-class VTAHostDPIMaster extends Bundle with VTAHostDPIParams {
-  val req = new Bundle {
-    val valid = Output(Bool())
-    val opcode = Output(Bool())
-    val addr = Output(UInt(dpiAddrBits.W))
-    val value = Output(UInt(dpiDataBits.W))
-    val deq = Input(Bool())
-  }
-  val resp = Flipped(ValidIO(UInt(dpiDataBits.W)))
-}
-
-/** Host client interface.
- *
- * This interface is tipically used by the Accelerator
- */
-class VTAHostDPIClient extends Bundle with VTAHostDPIParams {
-  val req = new Bundle {
-    val valid = Input(Bool())
-    val opcode = Input(Bool())
-    val addr = Input(UInt(dpiAddrBits.W))
-    val value = Input(UInt(dpiDataBits.W))
-    val deq = Output(Bool())
-  }
-  val resp = ValidIO(UInt(dpiDataBits.W))
-}
-
-/** Host DPI module.
- *
- * Wrapper for Host Verilog DPI module.
- */
-class VTAHostDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi = new VTAHostDPIMaster
-  })
-  setResource("/verilog/VTAHostDPI.v")
-}
-
-/** Host DPI to AXI Converter.
- *
- * Convert Host DPI to AXI for VTAShell
- */
-class VTAHostDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val dpi = new VTAHostDPIClient
-    val axi = new AXILiteMaster(p(ShellKey).hostParams)
-  })
-  val addr = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.addr)))
-  val data = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.value)))
-  val sIdle :: sReadAddress :: sReadData :: sWriteAddress :: sWriteData :: sWriteResponse :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.dpi.req.valid) {
-        when(io.dpi.req.opcode) {
-          state := sWriteAddress
-        }.otherwise {
-          state := sReadAddress
-        }
-      }
-    }
-    is(sReadAddress) {
-      when(io.axi.ar.ready) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.axi.r.valid) {
-        state := sIdle
-      }
-    }
-    is(sWriteAddress) {
-      when(io.axi.aw.ready) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.axi.w.ready) {
-        state := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.axi.b.valid) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle && io.dpi.req.valid) {
-    addr := io.dpi.req.addr
-    data := io.dpi.req.value
-  }
-
-  io.axi.aw.valid := state === sWriteAddress
-  io.axi.aw.bits.addr := addr
-  io.axi.w.valid := state === sWriteData
-  io.axi.w.bits.data := data
-  io.axi.w.bits.strb := "h_f".U
-  io.axi.b.ready := state === sWriteResponse
-
-  io.axi.ar.valid := state === sReadAddress
-  io.axi.ar.bits.addr := addr
-  io.axi.r.ready := state === sReadData
-
-  io.dpi.req.deq := (state === sReadAddress & io.axi.ar.ready) | (state === sWriteAddress & io.axi.aw.ready)
-  io.dpi.resp.valid := io.axi.r.valid
-  io.dpi.resp.bits := io.axi.r.bits.data
-
-  if (debug) {
-    when(state === sWriteAddress && io.axi.aw.ready) {
-      printf("[VTAHostDPIToAXI] [AW] addr:%x\n", addr)
-    }
-    when(state === sReadAddress && io.axi.ar.ready) {
-      printf("[VTAHostDPIToAXI] [AR] addr:%x\n", addr)
-    }
-    when(io.axi.r.fire()) {
-      printf("[VTAHostDPIToAXI] [R] value:%x\n", io.axi.r.bits.data)
-    }
-    when(io.axi.w.fire()) {
-      printf("[VTAHostDPIToAXI] [W] value:%x\n", io.axi.w.bits.data)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
deleted file mode 100644
index bffbc1c..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Memory DPI parameters */
-trait VTAMemDPIParams {
-  val dpiLenBits = 8
-  val dpiAddrBits = 64
-  val dpiDataBits = 64
-}
-
-/** Memory master interface.
- *
- * This interface is tipically used by the Accelerator
- */
-class VTAMemDPIMaster extends Bundle with VTAMemDPIParams {
-  val req = new Bundle {
-    val valid = Output(Bool())
-    val opcode = Output(Bool())
-    val len = Output(UInt(dpiLenBits.W))
-    val addr = Output(UInt(dpiAddrBits.W))
-  }
-  val wr = ValidIO(UInt(dpiDataBits.W))
-  val rd = Flipped(Decoupled(UInt(dpiDataBits.W)))
-}
-
-/** Memory client interface.
- *
- * This interface is tipically used by the Host
- */
-class VTAMemDPIClient extends Bundle with VTAMemDPIParams {
-  val req = new Bundle {
-    val valid = Input(Bool())
-    val opcode = Input(Bool())
-    val len = Input(UInt(dpiLenBits.W))
-    val addr = Input(UInt(dpiAddrBits.W))
-  }
-  val wr = Flipped(ValidIO(UInt(dpiDataBits.W)))
-  val rd = Decoupled(UInt(dpiDataBits.W))
-}
-
-/** Memory DPI module.
- *
- * Wrapper for Memory Verilog DPI module.
- */
-class VTAMemDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi = new VTAMemDPIClient
-  })
-  setResource("/verilog/VTAMemDPI.v")
-}
-
-class VTAMemDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val dpi = new VTAMemDPIMaster
-    val axi = new AXIClient(p(ShellKey).memParams)
-  })
-  val opcode = RegInit(false.B)
-  val len = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.len)))
-  val addr = RegInit(0.U.asTypeOf(chiselTypeOf(io.dpi.req.addr)))
-  val sIdle :: sReadAddress :: sReadData :: sWriteAddress :: sWriteData :: sWriteResponse :: Nil =
-    Enum(6)
-  val state = RegInit(sIdle)
-
-  switch(state) {
-    is(sIdle) {
-      when(io.axi.ar.valid) {
-        state := sReadAddress
-      }.elsewhen(io.axi.aw.valid) {
-        state := sWriteAddress
-      }
-    }
-    is(sReadAddress) {
-      when(io.axi.ar.valid) {
-        state := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.axi.r.ready && io.dpi.rd.valid && len === 0.U) {
-        state := sIdle
-      }
-    }
-    is(sWriteAddress) {
-      when(io.axi.aw.valid) {
-        state := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.axi.w.valid && io.axi.w.bits.last) {
-        state := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.axi.b.ready) {
-        state := sIdle
-      }
-    }
-  }
-
-  when(state === sIdle) {
-    when(io.axi.ar.valid) {
-      opcode := false.B
-      len := io.axi.ar.bits.len
-      addr := io.axi.ar.bits.addr
-    }.elsewhen(io.axi.aw.valid) {
-      opcode := true.B
-      len := io.axi.aw.bits.len
-      addr := io.axi.aw.bits.addr
-    }
-  }.elsewhen(state === sReadData) {
-    when(io.axi.r.ready && io.dpi.rd.valid && len =/= 0.U) {
-      len := len - 1.U
-    }
-  }
-
-  io.dpi.req.valid := (state === sReadAddress & io.axi.ar.valid) | (state === sWriteAddress & io.axi.aw.valid)
-  io.dpi.req.opcode := opcode
-  io.dpi.req.len := len
-  io.dpi.req.addr := addr
-
-  io.axi.ar.ready := state === sReadAddress
-  io.axi.aw.ready := state === sWriteAddress
-
-  io.axi.r.valid := state === sReadData & io.dpi.rd.valid
-  io.axi.r.bits.data := io.dpi.rd.bits
-  io.axi.r.bits.last := len === 0.U
-  io.axi.r.bits.resp := 0.U
-  io.axi.r.bits.user := 0.U
-  io.axi.r.bits.id := 0.U
-  io.dpi.rd.ready := state === sReadData & io.axi.r.ready
-
-  io.dpi.wr.valid := state === sWriteData & io.axi.w.valid
-  io.dpi.wr.bits := io.axi.w.bits.data
-  io.axi.w.ready := state === sWriteData
-
-  io.axi.b.valid := state === sWriteResponse
-  io.axi.b.bits.resp := 0.U
-  io.axi.b.bits.user := 0.U
-  io.axi.b.bits.id := 0.U
-
-  if (debug) {
-    when(state === sReadAddress && io.axi.ar.valid) {
-      printf("[VTAMemDPIToAXI] [AR] addr:%x len:%x\n", addr, len)
-    }
-    when(state === sWriteAddress && io.axi.aw.valid) {
-      printf("[VTAMemDPIToAXI] [AW] addr:%x len:%x\n", addr, len)
-    }
-    when(io.axi.r.fire()) {
-      printf("[VTAMemDPIToAXI] [R] last:%x data:%x\n",
-        io.axi.r.bits.last,
-        io.axi.r.bits.data)
-    }
-    when(io.axi.w.fire()) {
-      printf("[VTAMemDPIToAXI] [W] last:%x data:%x\n",
-        io.axi.w.bits.last,
-        io.axi.w.bits.data)
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
deleted file mode 100644
index 2f25328..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.dpi
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-
-/** Sim DPI module.
- *
- * Wrapper for Sim Verilog DPI module.
- *
- * This is a Chisel BlackBox: only the port list is declared here, and the
- * implementation is taken from the Verilog resource `/verilog/VTASimDPI.v`.
- */
-class VTASimDPI extends BlackBox with HasBlackBoxResource {
-  val io = IO(new Bundle {
-    val clock = Input(Clock())
-    val reset = Input(Bool())
-    val dpi_wait = Output(Bool())
-  })
-  setResource("/verilog/VTASimDPI.v")
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala b/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala
deleted file mode 100644
index 5151590..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/interface/axi/AXI.scala
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.interface.axi
-
-import chisel3._
-import chisel3.util._
-import vta.util.genericbundle._
-
-/** AXI interface parameters.
- *
- * Holds the configurable signal widths of an AXI port together with the
- * fixed widths and constant field values derived from them.
- *
- * @param coherent selects the coherent values for the user, cache and prot constants
- * @param idBits   width of the id fields
- * @param addrBits width of the address fields, must be positive
- * @param dataBits width of the data fields, must be a non-zero multiple of 8
- * @param lenBits  width of the burst-length field
- * @param userBits width of the user fields
- */
-case class AXIParams(
-    coherent: Boolean = false,
-    idBits: Int = 1,
-    addrBits: Int = 32,
-    dataBits: Int = 64,
-    lenBits: Int = 8,
-    userBits: Int = 1
-) {
-  require(addrBits > 0, s"[VTA] [AXIParams] addrBits must be positive, got $addrBits")
-  // strbBits and sizeConst are both derived from dataBits / 8, so the data
-  // bus has to be a whole number of bytes; checking for evenness is not enough.
-  require(dataBits >= 8 && dataBits % 8 == 0,
-    s"[VTA] [AXIParams] dataBits must be a non-zero multiple of 8, got $dataBits")
-
-  // one write-strobe bit per data byte
-  val strbBits = dataBits / 8
-  val sizeBits = 3
-  val burstBits = 2
-  val lockBits = 2
-  val cacheBits = 4
-  val protBits = 3
-  val qosBits = 4
-  val regionBits = 4
-  val respBits = 2
-  // log2 of the number of bytes per beat
-  val sizeConst = log2Ceil(dataBits / 8)
-  val idConst = 0
-  val userConst = if (coherent) 1 else 0
-  val burstConst = 1
-  val lockConst = 0
-  val cacheConst = if (coherent) 15 else 3
-  val protConst = if (coherent) 4 else 0
-  val qosConst = 0
-  val regionConst = 0
-}
-
-/** Common base class of the AXI bundles in this file; forwards `params` to
- * GenericParameterizedBundle.
- */
-abstract class AXIBase(params: AXIParams)
-  extends GenericParameterizedBundle(params)
-
-// AXILite
-
-/** AXI-Lite address payload: a single `addr` field, `params.addrBits` wide. */
-class AXILiteAddress(params: AXIParams) extends AXIBase(params) {
-  val addr = UInt(params.addrBits.W)
-}
-
-/** AXI-Lite write-data payload: `data` plus a `strb` field of `params.strbBits` bits. */
-class AXILiteWriteData(params: AXIParams) extends AXIBase(params) {
-  val data = UInt(params.dataBits.W)
-  val strb = UInt(params.strbBits.W)
-}
-
-/** AXI-Lite write-response payload: a single `resp` field, `params.respBits` wide. */
-class AXILiteWriteResponse(params: AXIParams) extends AXIBase(params) {
-  val resp = UInt(params.respBits.W)
-}
-
-/** AXI-Lite read-data payload: `data` together with its `resp` field. */
-class AXILiteReadData(params: AXIParams) extends AXIBase(params) {
-  val data = UInt(params.dataBits.W)
-  val resp = UInt(params.respBits.W)
-}
-
-/** AXI-Lite master port.
- *
- * The master drives the AW, W and AR channels and receives the B and R
- * channels (declared `Flipped`).
- */
-class AXILiteMaster(params: AXIParams) extends AXIBase(params) {
-  val aw = Decoupled(new AXILiteAddress(params))
-  val w = Decoupled(new AXILiteWriteData(params))
-  val b = Flipped(Decoupled(new AXILiteWriteResponse(params)))
-  val ar = Decoupled(new AXILiteAddress(params))
-  val r = Flipped(Decoupled(new AXILiteReadData(params)))
-
-  /** Drives every master-side signal to an inactive value: `valid` and
-   * `ready` low, payload fields zero.
-   */
-  def tieoff(): Unit = {
-    aw.valid := false.B
-    aw.bits.addr := 0.U
-    w.valid := false.B
-    w.bits.data := 0.U
-    w.bits.strb := 0.U
-    b.ready := false.B
-    ar.valid := false.B
-    ar.bits.addr := 0.U
-    r.ready := false.B
-  }
-}
-
-/** AXI-Lite client port.
- *
- * Mirror image of AXILiteMaster: the AW, W and AR channels are received
- * (declared `Flipped`) and the B and R channels are driven.
- */
-class AXILiteClient(params: AXIParams) extends AXIBase(params) {
-  val aw = Flipped(Decoupled(new AXILiteAddress(params)))
-  val w = Flipped(Decoupled(new AXILiteWriteData(params)))
-  val b = Decoupled(new AXILiteWriteResponse(params))
-  val ar = Flipped(Decoupled(new AXILiteAddress(params)))
-  val r = Decoupled(new AXILiteReadData(params))
-
-  /** Drives every client-side signal to an inactive value: `ready` and
-   * `valid` low, response payload fields zero.
-   */
-  def tieoff(): Unit = {
-    aw.ready := false.B
-    w.ready := false.B
-    b.valid := false.B
-    b.bits.resp := 0.U
-    ar.ready := false.B
-    r.valid := false.B
-    r.bits.resp := 0.U
-    r.bits.data := 0.U
-  }
-}
-
-// AXI extends AXILite
-
-/** Full AXI address payload: the AXI-Lite `addr` field extended with the
- * id, user, len, size, burst, lock, cache, prot, qos and region fields.
- * Every width comes from `params`.
- */
-class AXIAddress(params: AXIParams) extends AXILiteAddress(params) {
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-  val len = UInt(params.lenBits.W)
-  val size = UInt(params.sizeBits.W)
-  val burst = UInt(params.burstBits.W)
-  val lock = UInt(params.lockBits.W)
-  val cache = UInt(params.cacheBits.W)
-  val prot = UInt(params.protBits.W)
-  val qos = UInt(params.qosBits.W)
-  val region = UInt(params.regionBits.W)
-}
-
-/** Full AXI write-data payload: AXI-Lite `data`/`strb` plus `last`, `id` and `user`. */
-class AXIWriteData(params: AXIParams) extends AXILiteWriteData(params) {
-  val last = Bool()
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-/** Full AXI write-response payload: AXI-Lite `resp` plus `id` and `user`. */
-class AXIWriteResponse(params: AXIParams) extends AXILiteWriteResponse(params) {
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-/** Full AXI read-data payload: AXI-Lite `data`/`resp` plus `last`, `id` and `user`. */
-class AXIReadData(params: AXIParams) extends AXILiteReadData(params) {
-  val last = Bool()
-  val id = UInt(params.idBits.W)
-  val user = UInt(params.userBits.W)
-}
-
-/** Full AXI master port.
- *
- * The master drives the AW, W and AR channels and receives the B and R
- * channels (declared `Flipped`).
- */
-class AXIMaster(params: AXIParams) extends AXIBase(params) {
-  val aw = Decoupled(new AXIAddress(params))
-  val w = Decoupled(new AXIWriteData(params))
-  val b = Flipped(Decoupled(new AXIWriteResponse(params)))
-  val ar = Decoupled(new AXIAddress(params))
-  val r = Flipped(Decoupled(new AXIReadData(params)))
-
-  /** Drives every master-side signal to an inactive value: `valid` and
-   * `ready` low, payload fields zero.
-   */
-  def tieoff(): Unit = {
-    // AW and AR carry the same payload type, so they are tied off together.
-    for (ch <- Seq(aw, ar)) {
-      ch.valid := false.B
-      ch.bits.addr := 0.U
-      ch.bits.id := 0.U
-      ch.bits.user := 0.U
-      ch.bits.len := 0.U
-      ch.bits.size := 0.U
-      ch.bits.burst := 0.U
-      ch.bits.lock := 0.U
-      ch.bits.cache := 0.U
-      ch.bits.prot := 0.U
-      ch.bits.qos := 0.U
-      ch.bits.region := 0.U
-    }
-    w.valid := false.B
-    w.bits.data := 0.U
-    w.bits.strb := 0.U
-    w.bits.last := false.B
-    w.bits.id := 0.U
-    w.bits.user := 0.U
-    b.ready := false.B
-    r.ready := false.B
-  }
-
-  /** Drives the payload fields that have a fixed value in `params`.
-   *
-   * `valid`, `addr`, `len`, write `data` and `last` are not touched here and
-   * stay under the control of the caller.
-   */
-  def setConst(): Unit = {
-    for (ch <- Seq(aw, ar)) {
-      ch.bits.user := params.userConst.U
-      ch.bits.burst := params.burstConst.U
-      ch.bits.lock := params.lockConst.U
-      ch.bits.cache := params.cacheConst.U
-      ch.bits.prot := params.protConst.U
-      ch.bits.qos := params.qosConst.U
-      ch.bits.region := params.regionConst.U
-      ch.bits.size := params.sizeConst.U
-      ch.bits.id := params.idConst.U
-    }
-    w.bits.id := params.idConst.U
-    w.bits.user := params.userConst.U
-    // all byte lanes enabled
-    w.bits.strb := Fill(params.strbBits, true.B)
-  }
-}
-
-/** Full AXI client port.
- *
- * Mirror image of AXIMaster: the AW, W and AR channels are received
- * (declared `Flipped`) and the B and R channels are driven.
- */
-class AXIClient(params: AXIParams) extends AXIBase(params) {
-  val aw = Flipped(Decoupled(new AXIAddress(params)))
-  val w = Flipped(Decoupled(new AXIWriteData(params)))
-  val b = Decoupled(new AXIWriteResponse(params))
-  val ar = Flipped(Decoupled(new AXIAddress(params)))
-  val r = Decoupled(new AXIReadData(params))
-
-  /** Drives every client-side signal to an inactive value: `ready` and
-   * `valid` low, response payload fields zero.
-   */
-  def tieoff(): Unit = {
-    aw.ready := false.B
-    w.ready := false.B
-    b.valid := false.B
-    b.bits.resp := 0.U
-    b.bits.user := 0.U
-    b.bits.id := 0.U
-    ar.ready := false.B
-    r.valid := false.B
-    r.bits.resp := 0.U
-    r.bits.data := 0.U
-    r.bits.user := 0.U
-    r.bits.last := false.B
-    r.bits.id := 0.U
-  }
-}
-
-// XilinxAXILiteClient and XilinxAXIMaster bundles are needed
-// for wrapper purposes, because the package RTL tool in Xilinx Vivado
-// only allows certain name formats
-
-/** Flat AXI-Lite client port list with upper-case signal names.
- *
- * Directions are written from the client's side: request channels
- * (AW, W, AR) are inputs with a READY output, response channels (B, R) are
- * outputs with a READY input.
- */
-class XilinxAXILiteClient(params: AXIParams) extends AXIBase(params) {
-  val AWVALID = Input(Bool())
-  val AWREADY = Output(Bool())
-  val AWADDR = Input(UInt(params.addrBits.W))
-  val WVALID = Input(Bool())
-  val WREADY = Output(Bool())
-  val WDATA = Input(UInt(params.dataBits.W))
-  val WSTRB = Input(UInt(params.strbBits.W))
-  val BVALID = Output(Bool())
-  val BREADY = Input(Bool())
-  val BRESP = Output(UInt(params.respBits.W))
-  val ARVALID = Input(Bool())
-  val ARREADY = Output(Bool())
-  val ARADDR = Input(UInt(params.addrBits.W))
-  val RVALID = Output(Bool())
-  val RREADY = Input(Bool())
-  val RDATA = Output(UInt(params.dataBits.W))
-  val RRESP = Output(UInt(params.respBits.W))
-}
-
-/** Flat full-AXI master port list with upper-case signal names.
- *
- * Directions are written from the master's side: request channels
- * (AW, W, AR) are outputs with a READY input, response channels (B, R) are
- * inputs with a READY output.
- */
-class XilinxAXIMaster(params: AXIParams) extends AXIBase(params) {
-  // write address channel
-  val AWVALID = Output(Bool())
-  val AWREADY = Input(Bool())
-  val AWADDR = Output(UInt(params.addrBits.W))
-  val AWID = Output(UInt(params.idBits.W))
-  val AWUSER = Output(UInt(params.userBits.W))
-  val AWLEN = Output(UInt(params.lenBits.W))
-  val AWSIZE = Output(UInt(params.sizeBits.W))
-  val AWBURST = Output(UInt(params.burstBits.W))
-  val AWLOCK = Output(UInt(params.lockBits.W))
-  val AWCACHE = Output(UInt(params.cacheBits.W))
-  val AWPROT = Output(UInt(params.protBits.W))
-  val AWQOS = Output(UInt(params.qosBits.W))
-  val AWREGION = Output(UInt(params.regionBits.W))
-  // write data channel
-  val WVALID = Output(Bool())
-  val WREADY = Input(Bool())
-  val WDATA = Output(UInt(params.dataBits.W))
-  val WSTRB = Output(UInt(params.strbBits.W))
-  val WLAST = Output(Bool())
-  val WID = Output(UInt(params.idBits.W))
-  val WUSER = Output(UInt(params.userBits.W))
-  // write response channel
-  val BVALID = Input(Bool())
-  val BREADY = Output(Bool())
-  val BRESP = Input(UInt(params.respBits.W))
-  val BID = Input(UInt(params.idBits.W))
-  val BUSER = Input(UInt(params.userBits.W))
-  // read address channel
-  val ARVALID = Output(Bool())
-  val ARREADY = Input(Bool())
-  val ARADDR = Output(UInt(params.addrBits.W))
-  val ARID = Output(UInt(params.idBits.W))
-  val ARUSER = Output(UInt(params.userBits.W))
-  val ARLEN = Output(UInt(params.lenBits.W))
-  val ARSIZE = Output(UInt(params.sizeBits.W))
-  val ARBURST = Output(UInt(params.burstBits.W))
-  val ARLOCK = Output(UInt(params.lockBits.W))
-  val ARCACHE = Output(UInt(params.cacheBits.W))
-  val ARPROT = Output(UInt(params.protBits.W))
-  val ARQOS = Output(UInt(params.qosBits.W))
-  val ARREGION = Output(UInt(params.regionBits.W))
-  // read data channel
-  val RVALID = Input(Bool())
-  val RREADY = Output(Bool())
-  val RDATA = Input(UInt(params.dataBits.W))
-  val RRESP = Input(UInt(params.respBits.W))
-  val RLAST = Input(Bool())
-  val RID = Input(UInt(params.idBits.W))
-  val RUSER = Input(UInt(params.userBits.W))
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala
deleted file mode 100644
index b0c5402..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/Configs.scala
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.interface.axi._
-
-/** PynqConfig.
- *
- * Shell configuration for Pynq: a non-coherent 16-bit-address / 32-bit-data
- * host port and a coherent 32-bit-address / 64-bit-data memory port.
- */
-class PynqConfig extends Config((site, here, up) => {
-  case ShellKey =>
-    val hostAxi = AXIParams(
-      coherent = false, addrBits = 16, dataBits = 32, lenBits = 8, userBits = 1)
-    val memAxi = AXIParams(
-      coherent = true, addrBits = 32, dataBits = 64, lenBits = 8, userBits = 1)
-    ShellParams(
-      hostParams = hostAxi,
-      memParams = memAxi,
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams())
-})
-
-/** F1Config.
- *
- * Shell configuration for F1: a non-coherent 16-bit-address / 32-bit-data
- * host port and a non-coherent 64-bit-address / 64-bit-data memory port.
- */
-class F1Config extends Config((site, here, up) => {
-  case ShellKey =>
-    val hostAxi = AXIParams(
-      coherent = false, addrBits = 16, dataBits = 32, lenBits = 8, userBits = 1)
-    val memAxi = AXIParams(
-      coherent = false, addrBits = 64, dataBits = 64, lenBits = 8, userBits = 1)
-    ShellParams(
-      hostParams = hostAxi,
-      memParams = memAxi,
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams())
-})
-
-/** De10Config.
- *
- * Shell configuration for De10. Both ports use a 4-bit burst-length field;
- * fields not listed below keep the AXIParams defaults.
- */
-class De10Config extends Config((site, here, up) => {
-  case ShellKey =>
-    val hostAxi = AXIParams(idBits = 13, addrBits = 16, dataBits = 32, lenBits = 4)
-    val memAxi = AXIParams(
-      coherent = true,
-      addrBits = 32,
-      dataBits = 64,
-      lenBits = 4, // limit to 16 beats, instead of 256 beats in AXI4
-      userBits = 5)
-    ShellParams(
-      hostParams = hostAxi,
-      memParams = memAxi,
-      vcrParams = VCRParams(),
-      vmeParams = VMEParams())
-})
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala
deleted file mode 100644
index e1b6995..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/IntelShell.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import vta.util.config._
-import vta.interface.axi._
-import vta.core._
-
-/** IntelShell.
- *
- * The IntelShell is based on a VME, VCR and core. This creates a complete VTA
- * system that can be used for simulation or real hardware.
- *
- * The host port is a full AXI client while the VCR only speaks AXI-Lite, so
- * the two are connected signal by signal and the extra full-AXI response
- * fields (id, user, last) are generated here.
- */
-class IntelShell(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val host = new AXIClient(p(ShellKey).hostParams)
-    val mem = new AXIMaster(p(ShellKey).memParams)
-  })
-
-  val vcr = Module(new VCR)
-  val vme = Module(new VME)
-  val core = Module(new Core)
-
-  core.io.vcr <> vcr.io.vcr
-  vme.io.vme <> core.io.vme
-
-  // The VCR raises b.valid / r.valid in a later state than the one in which
-  // it accepts the address, and the request-channel id is only meaningful
-  // while its channel is valid. Capture the ids at the address handshake so
-  // that the responses return the id of the transaction they belong to.
-  val hostIdBits = p(ShellKey).hostParams.idBits
-  val wid = RegInit(0.U(hostIdBits.W))
-  val rid = RegInit(0.U(hostIdBits.W))
-  when(io.host.aw.valid && vcr.io.host.aw.ready) { wid := io.host.aw.bits.id }
-  when(io.host.ar.valid && vcr.io.host.ar.ready) { rid := io.host.ar.bits.id }
-
-  // vcr.io.host <> io.host
-  io.host.aw.ready := vcr.io.host.aw.ready
-  vcr.io.host.aw.valid := io.host.aw.valid
-  vcr.io.host.aw.bits.addr := io.host.aw.bits.addr
-  io.host.w.ready := vcr.io.host.w.ready
-  vcr.io.host.w.valid := io.host.w.valid
-  vcr.io.host.w.bits.data := io.host.w.bits.data
-  vcr.io.host.w.bits.strb := io.host.w.bits.strb
-  vcr.io.host.b.ready := io.host.b.ready
-  io.host.b.valid := vcr.io.host.b.valid
-  io.host.b.bits.resp := vcr.io.host.b.bits.resp
-  io.host.b.bits.id := wid
-
-  io.host.ar.ready := vcr.io.host.ar.ready
-  vcr.io.host.ar.valid := io.host.ar.valid
-  vcr.io.host.ar.bits.addr := io.host.ar.bits.addr
-  vcr.io.host.r.ready := io.host.r.ready
-  io.host.r.valid := vcr.io.host.r.valid
-  io.host.r.bits.data := vcr.io.host.r.bits.data
-  io.host.r.bits.resp := vcr.io.host.r.bits.resp
-  io.host.r.bits.id := rid
-
-  io.host.b.bits.user <> DontCare
-  io.host.r.bits.user <> DontCare
-  // the VCR returns one data beat per read, so every beat is the last one
-  io.host.r.bits.last := 1.U
-
-  io.mem <> vme.io.mem
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala
deleted file mode 100644
index 0909d1b..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/SimShell.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.experimental.MultiIOModule
-import vta.util.config._
-import vta.interface.axi._
-import vta.shell._
-import vta.dpi._
-
-/** VTAHost.
- *
- * This module translates the DPI protocol into AXI. It is a simulation-only
- * module used to test host-to-VTA communication. It should be updated when
- * testing hosts that use a bus protocol other than AXI.
- *
- * It instantiates VTAHostDPI and VTAHostDPIToAXI, connects their DPI
- * interfaces and exposes the resulting AXI-Lite master port as `io.axi`.
- */
-class VTAHost(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val axi = new AXILiteMaster(p(ShellKey).hostParams)
-  })
-  val host_dpi = Module(new VTAHostDPI)
-  val host_axi = Module(new VTAHostDPIToAXI)
-  // the DPI module takes this module's clock and reset through its io bundle
-  host_dpi.io.reset := reset
-  host_dpi.io.clock := clock
-  host_axi.io.dpi <> host_dpi.io.dpi
-  io.axi <> host_axi.io.axi
-}
-
-/** VTAMem.
- *
- * This module translates the DPI protocol into AXI. It is a simulation-only
- * module used to test VTA-to-memory communication. It should be updated when
- * testing memories that use a bus protocol other than AXI.
- *
- * It instantiates VTAMemDPI and VTAMemDPIToAXI, connects their DPI
- * interfaces and exposes the resulting AXI client port as `io.axi`.
- */
-class VTAMem(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val axi = new AXIClient(p(ShellKey).memParams)
-  })
-  val mem_dpi = Module(new VTAMemDPI)
-  val mem_axi = Module(new VTAMemDPIToAXI)
-  // the DPI module takes this module's clock and reset through its io bundle
-  mem_dpi.io.reset := reset
-  mem_dpi.io.clock := clock
-  mem_dpi.io.dpi <> mem_axi.io.dpi
-  mem_axi.io.axi <> io.axi
-}
-
-/** VTASim.
- *
- * This module is used to control the hardware simulation thread, e.g. to
- * halt or terminate it. The sim_wait port halts the simulation thread while
- * it is asserted and resumes it when it is de-asserted.
- *
- * It wraps VTASimDPI and forwards its `dpi_wait` output as `sim_wait`.
- */
-class VTASim(implicit p: Parameters) extends MultiIOModule {
-  val sim_wait = IO(Output(Bool()))
-  val sim = Module(new VTASimDPI)
-  sim.io.reset := reset
-  sim.io.clock := clock
-  sim_wait := sim.io.dpi_wait
-}
-
-/** SimShell.
- *
- * The simulation shell instantiates the sim, host and memory DPI modules
- * that are connected to the VTAShell. An extra clock, sim_clock, is used to
- * evaluate the VTASim DPI function while the main simulation clock is halted.
- */
-class SimShell(implicit p: Parameters) extends MultiIOModule {
-  val mem = IO(new AXIClient(p(ShellKey).memParams))
-  val host = IO(new AXILiteMaster(p(ShellKey).hostParams))
-  val sim_clock = IO(Input(Clock()))
-  val sim_wait = IO(Output(Bool()))
-  val mod_sim = Module(new VTASim)
-  val mod_host = Module(new VTAHost)
-  val mod_mem = Module(new VTAMem)
-  mem <> mod_mem.io.axi
-  host <> mod_host.io.axi
-  mod_sim.reset := reset
-  // VTASim is the only sub-module clocked by sim_clock instead of the implicit clock
-  mod_sim.clock := sim_clock
-  sim_wait := mod_sim.sim_wait
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala
deleted file mode 100644
index 9a80cd7..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VCR.scala
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.util.genericbundle._
-import vta.interface.axi._
-
-/** VCR parameters.
- *
- * These parameters are used on VCR interfaces and modules. The counts give
- * the number of registers of each kind in the VCR register file, which lays
- * them out in the order control, event counters, values, pointers and
- * ucnt registers.
- */
-case class VCRParams() {
-  val nCtrl = 1 // control registers
-  val nECnt = 1 // event-counter registers
-  val nVals = 1 // value registers
-  val nPtrs = 6 // pointers exposed to the core
-  val nUCnt = 1 // ucnt registers
-  val regBits = 32 // width of every register
-}
-
-/** VCRBase. Parameterized base class of the VCR bundles; forwards the
- * implicit Parameters to GenericParameterizedBundle.
- */
-abstract class VCRBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
-
-/** VCRMaster.
- *
- * This is the master interface used by VCR in the VTAShell to control
- * the Core unit.
- *
- * `launch`, `vals` and `ptrs` are driven by the VCR; `finish` and the
- * `ecnt` / `ucnt` valid-qualified values are received from the core.
- */
-class VCRMaster(implicit p: Parameters) extends VCRBase {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val launch = Output(Bool())
-  val finish = Input(Bool())
-  val ecnt = Vec(vp.nECnt, Flipped(ValidIO(UInt(vp.regBits.W))))
-  val vals = Output(Vec(vp.nVals, UInt(vp.regBits.W)))
-  // pointers are as wide as a memory address, not as a register
-  val ptrs = Output(Vec(vp.nPtrs, UInt(mp.addrBits.W)))
-  val ucnt = Vec(vp.nUCnt, Flipped(ValidIO(UInt(vp.regBits.W))))
-}
-
-/** VCRClient.
- *
- * This is the client interface used by the Core module to communicate
- * to the VCR in the VTAShell.
- *
- * Same fields as VCRMaster with every direction reversed: `launch`, `vals`
- * and `ptrs` are inputs; `finish`, `ecnt` and `ucnt` are outputs.
- */
-class VCRClient(implicit p: Parameters) extends VCRBase {
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val launch = Input(Bool())
-  val finish = Output(Bool())
-  val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
-  val vals = Input(Vec(vp.nVals, UInt(vp.regBits.W)))
-  val ptrs = Input(Vec(vp.nPtrs, UInt(mp.addrBits.W)))
-  val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
-}
-
-/** VTA Control Registers (VCR).
- *
- * This unit provides control registers to be used by a control unit,
- * typically a host processor. Every register is `regBits` wide; when the
- * memory address width is not 32 bits, each pointer is built from two
- * consecutive registers. These registers are read-only by the core
- * at the moment but this will likely change once we add support to general purpose
- * registers that could be used as event counters by the Core unit.
- *
- * Register file layout, one register every 4 bytes of host address space:
- * control, event counters, values, pointers, ucnt registers.
- */
-class VCR(implicit p: Parameters) extends Module {
-  val io = IO(new Bundle {
-    val host = new AXILiteClient(p(ShellKey).hostParams)
-    val vcr = new VCRMaster
-  })
-
-  val vp = p(ShellKey).vcrParams
-  val mp = p(ShellKey).memParams
-  val hp = p(ShellKey).hostParams
-
-  // Write control (AW, W, B)
-  // NOTE(review): the literal below is 16 bits wide, so this presumably
-  // relies on hp.addrBits being at least 16 - confirm against the configs.
-  val waddr = RegInit("h_ffff".U(hp.addrBits.W)) // init with invalid address
-  val wdata = io.host.w.bits.data
-  val sWriteAddress :: sWriteData :: sWriteResponse :: Nil = Enum(3)
-  val wstate = RegInit(sWriteAddress)
-
-  // read control (AR, R)
-  val sReadAddress :: sReadData :: Nil = Enum(2)
-  val rstate = RegInit(sReadAddress)
-  val rdata = RegInit(0.U(vp.regBits.W))
-
-  // registers
-  // a pointer takes one register with 32-bit addresses and two otherwise
-  val nPtrs = if (mp.addrBits == 32) vp.nPtrs else 2 * vp.nPtrs
-  val nTotal = vp.nCtrl + vp.nECnt + vp.nVals + nPtrs + vp.nUCnt
-
-  val reg = Seq.fill(nTotal)(RegInit(0.U(vp.regBits.W)))
-  // byte address of each register: 0, 4, 8, ...
-  val addr = Seq.tabulate(nTotal)(_ * 4)
-  val reg_map = (addr zip reg) map { case (a, r) => a.U -> r }
-  // index of the first register of each group
-  val eo = vp.nCtrl
-  val vo = eo + vp.nECnt
-  val po = vo + vp.nVals
-  val uo = po + nPtrs
-
-  // Write FSM: address -> data -> response. Each transition only checks the
-  // host-side signal because the VCR-side ready/valid is asserted whenever
-  // the FSM is in the matching state (see the assignments below).
-  switch(wstate) {
-    is(sWriteAddress) {
-      when(io.host.aw.valid) {
-        wstate := sWriteData
-      }
-    }
-    is(sWriteData) {
-      when(io.host.w.valid) {
-        wstate := sWriteResponse
-      }
-    }
-    is(sWriteResponse) {
-      when(io.host.b.ready) {
-        wstate := sWriteAddress
-      }
-    }
-  }
-
-  // remember the write address until the data beat arrives
-  when(io.host.aw.fire()) { waddr := io.host.aw.bits.addr }
-
-  io.host.aw.ready := wstate === sWriteAddress
-  io.host.w.ready := wstate === sWriteData
-  io.host.b.valid := wstate === sWriteResponse
-  io.host.b.bits.resp := 0.U
-
-  // Read FSM: address -> data, built the same way as the write FSM.
-  switch(rstate) {
-    is(sReadAddress) {
-      when(io.host.ar.valid) {
-        rstate := sReadData
-      }
-    }
-    is(sReadData) {
-      when(io.host.r.ready) {
-        rstate := sReadAddress
-      }
-    }
-  }
-
-  io.host.ar.ready := rstate === sReadAddress
-  io.host.r.valid := rstate === sReadData
-  io.host.r.bits.data := rdata
-  io.host.r.bits.resp := 0.U
-
-  // Control register: `finish` from the core takes priority over a host
-  // write and forces the value 0b10, which also clears the launch bit (bit 0).
-  when(io.vcr.finish) {
-    reg(0) := "b_10".U
-  }.elsewhen(io.host.w.fire() && addr(0).U === waddr) {
-    reg(0) := wdata
-  }
-
-  // Event counters: an update from the core takes priority over a host write.
-  for (i <- 0 until vp.nECnt) {
-    when(io.vcr.ecnt(i).valid) {
-      reg(eo + i) := io.vcr.ecnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(eo + i).U === waddr) {
-      reg(eo + i) := wdata
-    }
-  }
-
-  // Values and pointers are contiguous and are only written by the host.
-  for (i <- 0 until (vp.nVals + nPtrs)) {
-    when(io.host.w.fire() && addr(vo + i).U === waddr) {
-      reg(vo + i) := wdata
-    }
-  }
-
-  // Sample the addressed register at the AR handshake; addresses that do
-  // not map to a register read as 0.
-  when(io.host.ar.fire()) {
-    rdata := MuxLookup(io.host.ar.bits.addr, 0.U, reg_map)
-  }
-
-  // bit 0 of the control register launches the core
-  io.vcr.launch := reg(0)(0)
-
-  for (i <- 0 until vp.nVals) {
-    io.vcr.vals(i) := reg(vo + i)
-  }
-
-  if (mp.addrBits == 32) { // 32-bit pointers
-    for (i <- 0 until nPtrs) {
-      io.vcr.ptrs(i) := reg(po + i)
-    }
-  } else { // 64-bits pointers
-    // the register at the lower address holds the low word
-    for (i <- 0 until (nPtrs / 2)) {
-      io.vcr.ptrs(i) := Cat(reg(po + 2 * i + 1), reg(po + 2 * i))
-    }
-  }
-
-  // ucnt registers: an update from the core takes priority over a host write.
-  for (i <- 0 until vp.nUCnt) {
-    when(io.vcr.ucnt(i).valid) {
-      reg(uo + i) := io.vcr.ucnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(uo + i).U === waddr) {
-      reg(uo + i) := wdata
-    }
-  }
-}
diff --git a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala b/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala
deleted file mode 100644
index 41b24d1..0000000
--- a/vta/vta-hw/hardware/chisel/src/main/scala/shell/VME.scala
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package vta.shell
-
-import chisel3._
-import chisel3.util._
-import vta.util.config._
-import vta.util.genericbundle._
-import vta.interface.axi._
-
-/** VME parameters.
- *
- * Number of read and write clients served by the VME. Construction fails
- * unless there is at least one read client and exactly one write client.
- */
-case class VMEParams() {
-  val nReadClients: Int = 5
-  require(
-    nReadClients > 0,
-    "\n\n[VTA] [VMEParams] nReadClients must be larger than 0\n\n")
-
-  val nWriteClients: Int = 1
-  require(
-    nWriteClients == 1,
-    "\n\n[VTA] [VMEParams] nWriteClients must be 1, only one-write-client support atm\n\n")
-}
-
-/** VMEBase. Parameterized base class of the VME bundles; forwards the
- * implicit Parameters to GenericParameterizedBundle.
- */
-abstract class VMEBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
-
-/** VMECmd.
- *
- * This interface is used for creating write and read requests to memory.
- * A command is an address plus a length; both widths are taken from the
- * shell's memory AXI parameters.
- */
-class VMECmd(implicit p: Parameters) extends VMEBase {
-  val addrBits = p(ShellKey).memParams.addrBits
-  val lenBits = p(ShellKey).memParams.lenBits
-  val addr = UInt(addrBits.W)
-  val len = UInt(lenBits.W)
-}
-
... 10387 lines suppressed ...


Mime
View raw message