diff --git a/poros/CMakeLists.txt b/poros/CMakeLists.txt
new file mode 100644
index 0000000000..3245bbae39
--- /dev/null
+++ b/poros/CMakeLists.txt
@@ -0,0 +1,209 @@
+cmake_minimum_required(VERSION 3.21)
+project(poros)
+set(CMAKE_CXX_STANDARD 14)
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+
+
+option(BUILD_STATIC "build lib${PROJECT_NAME}.a static lib" OFF)
+option(BUILD_KERNEL "build lib${PROJECT_NAME}-kernel.so shared lib" OFF)
+option(BUILD_STATIC_KERNEL "build lib${PROJECT_NAME}-kernel.a static lib" OFF)
+option(BUILD_TOOL "build ${PROJECT_NAME}-tool, an executable binary output" OFF)
+option(TEST "build for test. copy '.so' to site-packages automatically after compile" OFF)
+option(DEBUG "build for debug. add '-g' flag to gcc for detailed debug information" ON)
+option(UT "build for unit test" OFF)
+
+
+# minimum requirements
+set(PYTHON_MINIMUM_VERSION 3.6)
+set(CUDA_MINIMUM_VERSION 10.2)
+set(PYTORCH_MINIMUM_VERSION 1.9)
+set(TENSORRT_MINIMUM_VERSION 8.2)
+set(CUDNN_MINIMUM_VERSION 8.0)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# find cuda
+find_package(CUDAToolkit ${CUDA_MINIMUM_VERSION} REQUIRED)
+
+# find python3
+find_package(Python3 ${PYTHON_MINIMUM_VERSION} REQUIRED COMPONENTS Interpreter Development)
+message(STATUS "Found Python: ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}.${Python3_VERSION_PATCH}")
+
+if (NOT Python3_SITELIB)
+    message(FATAL_ERROR "site-packages not found.")
+else ()
+    message(STATUS "site-packages: ${Python3_SITELIB}")
+endif ()
+
+# find pytorch
+find_package(Torch ${PYTORCH_MINIMUM_VERSION} REQUIRED HINTS ${Python3_SITELIB})
+
+# find TensorRT
+find_package(TensorRT ${TENSORRT_MINIMUM_VERSION} REQUIRED)
+get_filename_component(TENSORRT_LIB_DIR ${TensorRT_LIBRARIES} DIRECTORY)
+
+
+# find CUDNN
+find_package(CUDNN ${CUDNN_MINIMUM_VERSION} REQUIRED)
+
+
+## release headers
+# engine
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/engine/iengine.h" "${PROJECT_SOURCE_DIR}/src/poros/engine/engine_context.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/engine")
+# compile
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/compile/poros_module.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/compile")
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/compile/compile.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/compile")
+# converter
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/converter/iconverter.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/converter")
+# iplugin
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/iplugin/*.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/iplugin")
+## context
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/context/*.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/context")
+## lowering
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/lowering/*.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/lowering")
+## util
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/util/*.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/util")
+## log
+file(GLOB headers "${PROJECT_SOURCE_DIR}/src/poros/log/*.h")
+file(COPY ${headers} DESTINATION "${PROJECT_SOURCE_DIR}/build/include/poros/log")
"${PROJECT_SOURCE_DIR}/build/include/poros/log") + + +include_directories(${TORCH_INCLUDE_DIRS}) +include_directories(${TensorRT_INCLUDE_DIRS}) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CUDNN_INCLUDE_PATH}) +include_directories(src) +include_directories(src/poros/compile) + + +add_compile_options(-D__const__= -D_GNU_SOURCE) +add_compile_options(-lpthread -lcrypto -lrt -ldl -lz -fPIC -rdynamic) +add_compile_options(-std=c++17 -O2 -g -pipe -W -Wall -fPIC -Wno-deprecated-declarations -Wno-unused-parameter) +if (DEBUG) + add_compile_options(-g) # for debug +endif () + +add_compile_options( + -Wall + -Wno-comment + -Wno-error=implicit-fallthrough + -Wno-error=unused-but-set-variable + -Wno-error=misleading-indentation + -Wno-error=unused-function + -Wno-error=terminate + -Wno-unused-parameter + -Wno-deprecated-declarations +) + + +file( + GLOB POROS_CPP_FILES + "./src/poros/*/*.cpp" + "./src/poros/converter/*/*.cpp" + "./src/poros/converter/gpu/plugins/*.cpp" +) + + +# libporos.so +add_library(${PROJECT_NAME} SHARED ${POROS_CPP_FILES}) + +#set_target_properties(${PROJECT_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/poros/lib) +set_target_properties(${PROJECT_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/build/lib) + +#add_custom_command( +# TARGET ${PROJECT_NAME} +# COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/output/lib/lib${PROJECT_NAME}.so ${CMAKE_CURRENT_SOURCE_DIR}/python/poros/lib/ +#) + +# copy libporos.so to python site-packages/poros/lib for testing +if (TEST) + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMENT "copy ${LIBRARY_OUTPUT_PATH}/lib${PROJECT_NAME}.so to ${Python3_SITELIB}/poros/lib/lib${PROJECT_NAME}.so for rapid testing" + COMMAND ${CMAKE_COMMAND} -E copy ${LIBRARY_OUTPUT_DIRECTORY}/lib${PROJECT_NAME}.so ${Python3_SITELIB}/poros/lib/lib${PROJECT_NAME}.so + ) +endif () + + +if (BUILD_STATIC) + add_library(${PROJECT_NAME}-static STATIC ${POROS_CPP_FILES}) + set_target_properties(${PROJECT_NAME}-static PROPERTIES OUTPUT_NAME ${PROJECT_NAME}) +endif () + +# build gflags +set(GFLAGS_NAMESPACE google) +add_subdirectory(third_party/gflags) + + +find_package(BLAS) + +add_custom_target( + Creating_Symlink ALL + COMMAND_EXPAND_LISTS + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/third_party + COMMAND ${CMAKE_COMMAND} -E create_symlink ${TENSORRT_LIB_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/third_party/tensorrtlib + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Creating Symlink ${CMAKE_CURRENT_SOURCE_DIR}/third_party/tensorrtlib -> ${TENSORRT_LIB_DIR} " + VERBATIM +) + +# executable +if (BUILD_TOOL) + set(POROS_TOOL ${PROJECT_NAME}-tool) + + + add_executable(${POROS_TOOL}) + + target_sources(${POROS_TOOL} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tools/main.cpp) + target_sources(${POROS_TOOL} PUBLIC ${POROS_CPP_FILES}) + + target_link_libraries(${POROS_TOOL} gflags::gflags) + target_link_libraries(${POROS_TOOL} TensorRT::TensorRT TensorRT::Plugin) + target_link_libraries(${POROS_TOOL} torch) +# target_link_libraries(${POROS_TOOL} CUDA::toolkit) + target_link_libraries(${POROS_TOOL} CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusolver CUDA::cusparse) + target_link_libraries(${POROS_TOOL} BLAS::BLAS) + +endif () + + + +# kernel +file( + GLOB POROS_KERNEL_CPP_FILES + ./src/poros/compile/*.cpp + ./src/poros/context/*.cpp + ./src/poros/iplugin/*.cpp + ./src/poros/log/*.cpp + ./src/poros/lowering/*.cpp + ./src/poros/util/*.cpp + 
./src/poros/engine/engine.cpp +) + +# kernel SHARED +if (BUILD_KERNEL) + add_library(${PROJECT_NAME}-kernel SHARED ${POROS_KERNEL_CPP_FILES}) +endif () + +# kernel STATIC +if (BUILD_STATIC_KERNEL) + add_library(${PROJECT_NAME}-kernel-static STATIC ${POROS_KERNEL_CPP_FILES}) + set_target_properties(${PROJECT_NAME}-kernel-static PROPERTIES OUTPUT_NAME ${PROJECT_NAME}-kernel) +endif () + +if (UT) + add_subdirectory(third_party/googletest) + add_subdirectory(unittest) +endif () diff --git a/poros/LICENSE b/poros/LICENSE new file mode 100644 index 0000000000..f58676f3e2 --- /dev/null +++ b/poros/LICENSE @@ -0,0 +1,203 @@ +Copyright (c) 2022 Baidu, Inc. All Rights Reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/poros/README.md b/poros/README.md
new file mode 100644
index 0000000000..d87d0d3c67
--- /dev/null
+++ b/poros/README.md
@@ -0,0 +1,207 @@
+# Poros AI Inference Accelerator
+
+## Description
+
+Poros is an AI inference accelerator for deep learning frameworks. It provides significantly lower inference latency compared with the original model, and offers great flexibility for dynamic graphs.
+Poros currently works mainly on the TorchScript IR, which means it supports models from PyTorch, ONNX, TensorFlow and any other framework that can be converted to TorchScript. We are also planning to support more IRs in the future.
+Poros is designed to support multiple hardware backends conveniently. For now, it supports GPU and XPU (Baidu Kunlun) devices; contributions for additional devices are welcome.
+
+## How It Works
+
+Figure 1 shows the architecture of Poros. The central part marked by the red dotted line is the Model Optimizer, the main
+module of Poros. IR graphs are optimized by IR lowering, op fusing and op converting, and then segmented into engine-related
+subgraphs in a way that maximizes the number of ops in each engine kernel and minimizes the total number of engine kernels.
+
+![image](https://user-images.githubusercontent.com/54064850/203691621-e75d7c17-320c-4dff-8abe-58c3c9db99a2.png)
+
+To achieve the above goals on GPU, we have rewritten nearly one hundred TorchScript ops with the TensorRT API, which
+reduces the extra subgraphs caused by unsupported ops during subgraph partitioning. Dozens of lowering strategies, including
+op fusions, are employed to reduce the actual computing load of the CUDA kernels.
+
+## Dependencies
+
+Poros is developed based on PyTorch, CUDA, TensorRT (TRT Engine) and cuDNN. The minimum required and recommended versions of
+these packages are listed below:
+
+| Package  | Minimum Version | Recommended Version |
+|----------|-----------------|---------------------|
+| PyTorch  | 1.9.0           | 1.12.1              |
+| CUDA     | 10.2            | 11.3                |
+| TensorRT | 8.2             | 8.4                 |
+| CuDNN    | 7.6.5           | 8.4                 |
+| Python   | 3.6.5           | 3.8                 |
+
+If you build for GPU inference, it is better to align the CUDA version with the version that PyTorch was built with.
+For example, we recommend CUDA 11.1+ if the installed PyTorch version is 1.11.0+cu111; otherwise, "undefined
+reference CUDA...." errors may appear during the build.
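+
+A quick sanity check before building is to confirm which CUDA and cuDNN versions the installed PyTorch
+was actually built against (`torch.version.cuda` and `torch.backends.cudnn.version()` are standard
+PyTorch attributes):
+
+```python
+import torch
+
+print(torch.__version__)               # e.g. 1.12.1+cu113
+print(torch.version.cuda)              # the CUDA version PyTorch was built with, e.g. 11.3
+print(torch.backends.cudnn.version())  # the cuDNN version as an integer, e.g. 8302
+```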
+
+> There is a known cuBLAS-related issue in CUDA 10.2. If you are using CUDA 10.2, make sure these two patches have been installed.
+> https://developer.nvidia.com/cuda-10.2-download-archive?target_os=Linux&target_arch=x86_64&target_distro=Ubuntu&target_version=1804&target_type=runfilelocal
+
+## How To Build
+
+### 0. Install Dependencies
+
+Get the Poros source code:
+
+```shell
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd poros
+git submodule update --init --recursive --jobs 0 -f
+```
+
+We strongly recommend preparing the build environment with anaconda3:
+
+```shell
+conda create --name poros python=3.8
+conda activate poros
+export CMAKE_PREFIX_PATH=$CONDA_PREFIX
+conda install cmake==3.22.1 pytorch==1.12.1 cudatoolkit=11.3 numpy -c pytorch
+```
+**If CUDA has been installed system-wide, cudatoolkit is not necessary. Note that CMake >= 3.21 and GCC >= 8.2 are required.**
+
+
+Poros uses cmake to manage dependencies. It finds all dependency packages automatically as long as they were
+installed to the usual locations. Otherwise, you should specify the install locations of these packages manually:
+
+```shell
+export CUDAToolkit_ROOT=/cuda/install/dir/ #point CUDAToolkit_ROOT to the CUDA installation dir
+export TENSORRT_ROOT=/tensorrt/install/dir/ #download from Nvidia and unpack, no need to install into system
+export CUDNN_ROOT=/cudnn/install/dir/ #download from Nvidia and unpack, no need to install into system
+```
+Add CUDA, TensorRT and cuDNN to your environment variables:
+
+```shell
+export PATH=$CUDAToolkit_ROOT/bin:$PATH
+export LD_LIBRARY_PATH=$CUDAToolkit_ROOT/lib64:$TENSORRT_ROOT/lib:$CUDNN_ROOT/lib:$LD_LIBRARY_PATH
+```
+
+The additional dependency `mkl` is needed when building with PyTorch 1.11 + CUDA 11.1.
+If cmake cannot find it, you can try to add it by:
+```shell
+conda install mkl
+```
+
+Other packages that Poros depends on, such as gflags and googletest, can be downloaded
+by `git submodule update --init --recursive --jobs 0 -f`.
+
+### 1. Build Project with CMake
+
+```shell
+cd poros
+mkdir build
+cd build
+cmake ..
+make
+```
+
+By default, only the shared library (libporos.so) will be built.
+
+**To build a static lib (libporos.a):**
+
+```shell
+cmake -DBUILD_STATIC=on ..
+make
+```
+
+The Poros `kernel` contains the framework of Poros, including the IR lowering strategy, the subgraph segmentation strategy
+and the engine manager, without any specific engine (e.g. TensorRT). Developers who want to use their own
+engines can build the `kernel` separately with the options below:
+
+**To build a shared kernel lib (libporos-kernel.so):**
+
+```shell
+cmake -DBUILD_KERNEL=on ..
+make
+```
+
+**To build a static kernel lib (libporos-kernel.a):**
+
+```shell
+cmake -DBUILD_STATIC_KERNEL=on ..
+make
+```
+
+### 2. Build Distributing Package with setuptools (Python3)
+
+After libporos.so has been built, you can build the `.whl` package for Python3:
+
+```shell
+cd ../python
+python3 setup.py bdist_wheel
+```
+
+The output looks like `poros-0.1.0-cp38-cp38m-linux_x86_64.whl`. It can be installed easily with pip:
+
+```shell
+cd dist
+pip3 install poros-0.1.0-cp38-cp38m-linux_x86_64.whl
+```
+Alternatively, you can use `python3 setup.py develop` to create a symbolic link to the `python` dir.
+
+### 3. Build Executable Binary
+
+We provide an example C++ shell for users who want to build an executable binary. The `main.cpp` file is located
+at `tools/main.cpp`; you can modify the code according to your needs.
+The executable binary `poros-tool` can be built with this command:
+
+```shell
+mkdir build
+cd build
+cmake -DBUILD_TOOL=on ..
+make
+```
+
+### 4. Build Test
+```shell
+cmake -DUT=on ..
+make
+./unit_test # run unit test
+```
+
+
+## How To Use
+
+### 1. Python Usage:
+
+```python
+import poros
+import torch
+from torchvision import models

+original_model = models.resnet50(pretrained=True).cuda().eval() #load/download pre-trained model
+
+input = torch.randn(1, 3, 224, 224, dtype=torch.float32).cuda()
+input_datas = [input] #prewarm data for compilation
+option = poros.PorosOptions() #set poros option
+poros_model = poros.compile(torch.jit.script(original_model), input_datas, option) #build the model
+
+poros_res = poros_model(input) # use compiled model in the same way as the original model
+```
+
+The complete benchmark example (ResNet50) script is `python/example/test_resnet.py`:
+
+```shell
+python3 python/example/test_resnet.py
+```
+
+### 2. CPP Usage:
+
+If the executable binary `poros-tool` is built, you can run the benchmark like this:
+
+```shell
+./poros-tool --module_file_path ../../poros/tools/std_pretrained_resnet50_gpu.pt --test_mode=original #original PyTorch model
+./poros-tool --module_file_path ../../poros/tools/std_pretrained_resnet50_gpu.pt --test_mode=poros #poros compiled model
+```
+> PyTorch changed the packaging format of models in 1.4+, while the pretrained ResNet50 model still uses the old format (.tar).
+> You may need to convert it to the newer format (.zip) yourself, with a command like this:
+> ```python
+> original_model = models.resnet50(pretrained=True).cuda().eval()
+> torch.save(original_model, 'std_pretrained_resnet50_gpu.pt', _use_new_zipfile_serialization=True)
+> ```
+
+## Benchmark
+
+Take a look at the [Benchmark](docs/Benchmark.md).
+
+## Acknowledgement
+Poros has been incubated for more than 2 years. In this project, NVIDIA helped us a lot (especially Gary Ji, Vincent Zhang, Jie Fang). They answered many technical questions about GPUs and gave us many suggestions. We appreciate their great support.
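+
+## Appendix: Saving And Reloading A Compiled Model
+
+The Python package in this patch also exposes `poros.save` and `poros.load` (defined in
+`python/poros/_compile.py`, and shown commented out in `python/example/example.py`). A minimal sketch,
+assuming `poros_model` was compiled as in the Python usage example above:
+
+```python
+import poros
+
+option = poros.PorosOptions()
+poros.save(poros_model, "poros_model.pt")           # serialize the compiled module
+poros_model = poros.load("poros_model.pt", option)  # reload it with the same compile options
+```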
diff --git a/poros/cmake/FindTensorRT.cmake b/poros/cmake/FindTensorRT.cmake new file mode 100644 index 0000000000..32a3eb57cf --- /dev/null +++ b/poros/cmake/FindTensorRT.cmake @@ -0,0 +1,87 @@ +##################################### +## tensorrt specific configuration ## +##################################### + +set(_TensorRT_SEARCHES) + +if (DEFINED ENV{TENSORRT_ROOT}) + set(_TensorRT_SEARCH_ROOT PATHS $ENV{TENSORRT_ROOT} NO_DEFAULT_PATH) + list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) +endif () + +if (DEFINED ENV{TensorRT_INCLUDE_DIR}) + set(TensorRT_INCLUDE_DIR $ENV{TensorRT_INCLUDE_DIR}) +endif () + +if (DEFINED ENV{TensorRT_LIBRARY}) + set(TensorRT_LIBRARY $ENV{TensorRT_LIBRARY}) +endif () + +# appends some common paths +set(_TensorRT_SEARCH_NORMAL + PATHS "/usr/src/tensorrt/" # or custom tensorrt path + PATHS "/usr/local/tensorrt/" # or custom tensorrt path + PATHS "${PROJECT_SOURCE_DIR}/third_party/TensorRT/" # or custom tensorrt path + ) +list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) + +# Include dir +foreach (search ${_TensorRT_SEARCHES}) + find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h NvInferPlugin.h ${${search}} PATH_SUFFIXES include) +endforeach () + +if (NOT TensorRT_LIBRARY) + foreach (search ${_TensorRT_SEARCHES}) + find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib) + endforeach () +endif () + +if (NOT TensorRT_PLUGIN_LIBRARY) + foreach (search ${_TensorRT_SEARCHES}) + find_library(TensorRT_PLUGIN_LIBRARY NAMES nvinfer_plugin ${${search}} PATH_SUFFIXES lib) + endforeach () +endif () + + + +if (TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") + file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") + + string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") + string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") + set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") +endif () + + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_PLUGIN_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) +message(STATUS "TensorRT_LIBRARY: ${TensorRT_LIBRARY}") +message(STATUS "TensorRT_PLUGIN_LIBRARY: ${TensorRT_PLUGIN_LIBRARY}") +message(STATUS "TensorRT_INCLUDE_DIR: ${TensorRT_INCLUDE_DIR}") +message(STATUS "TensorRT: ${TensorRT_VERSION_STRING}") +if (TensorRT_FOUND) + set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) + + if (NOT TensorRT_LIBRARIES) + set(TensorRT_LIBRARIES ${TensorRT_LIBRARY}) + endif () + + if (NOT TARGET TensorRT::TensorRT) + add_library(TensorRT::TensorRT UNKNOWN IMPORTED) + set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") + set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") + set_property(TARGET TensorRT::TensorRT APPEND PROPERTY VERSION "${TensorRT_VERSION_STRING}") + endif () + + if (NOT TARGET TensorRT::Plugin) + 
add_library(TensorRT::Plugin UNKNOWN IMPORTED)
+        set_target_properties(TensorRT::Plugin PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
+        set_property(TARGET TensorRT::Plugin APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_PLUGIN_LIBRARY}")
+        set_property(TARGET TensorRT::Plugin APPEND PROPERTY VERSION "${TensorRT_VERSION_STRING}")
+    endif ()
+endif ()
+
+
diff --git a/poros/docs/Benchmark.md b/poros/docs/Benchmark.md
new file mode 100644
index 0000000000..a0021a4b60
--- /dev/null
+++ b/poros/docs/Benchmark.md
@@ -0,0 +1,130 @@
+# Benchmark
+## Environment
+This benchmark was run on CentOS 7 with an `A10` GPU and an `Intel(R) Xeon(R) Platinum 8350C CPU @ 2.60GHz`, in the following environment:
+| Package  | Version  |
+|----------|----------|
+| CUDA     | 11.3     |
+| cuDNN    | 8.3.2.44 |
+| TensorRT | 8.4.1.5  |
+| Python   | 3.8.13   |
+| PyTorch  | 1.12.1   |
+
+## Performance
+The following results compare PyTorch eager mode and Poros, measured as the average latency (ms) over 1000 inference runs of each model.
+
+### 1. ResNet50
+Input shape: bx3x224x224
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 6.17         | 1.70        |
+| 2          | 6.02         | 2.41        |
+| 4          | 6.33         | 3.23        |
+| 8          | 8.55         | 4.75        |
+| 16         | 16.22        | 7.82        |
+| 32         | 32.09        | 14.00       |
+
+Model source: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
+
+### 2. VGG16
+Input shape: bx3x224x224
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 3.20         | 2.71        |
+| 2          | 4.97         | 3.78        |
+| 4          | 8.20         | 6.09        |
+| 8          | 14.64        | 10.20       |
+| 16         | 27.47        | 19.17       |
+| 32         | 53.09        | 36.47       |
+
+Model source: https://github.com/pytorch/vision/blob/main/torchvision/models/vgg.py
+
+### 3. MobileNetV2
+Input shape: bx3x224x224
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 3.85         | 0.65        |
+| 2          | 3.75         | 0.86        |
+| 4          | 3.90         | 1.19        |
+| 8          | 4.18         | 2.08        |
+| 16         | 8.43         | 3.83        |
+| 32         | 16.57        | 7.14        |
+
+Model source: https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/MobileNetV2.py
+
+### 4. InceptionV3
+Input shape: bx3x224x224
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 10.05        | 2.51        |
+| 2          | 10.13        | 3.22        |
+| 4          | 10.08        | 3.70        |
+| 8          | 10.15        | 4.95        |
+| 16         | 12.51        | 7.11        |
+| 32         | 21.43        | 11.22       |
+
+Model source: https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py
+
+### 5. EfficientNet-B0
+Input shape: bx3x224x224
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 8.28         | 1.28        |
+| 2          | 8.50         | 1.57        |
+| 4          | 8.49         | 2.29        |
+| 8          | 8.83         | 3.65        |
+| 16         | 10.65        | 6.62        |
+| 32         | 20.51        | 12.51       |
+
+Model source: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py
+
+### 6. Bert-base-uncased
+Input shape: bx128
+| Batch size | PyTorch (ms) | Poros (ms) |
+|------------|--------------|-------------|
+| 1          | 6.40         | 2.02        |
+| 2          | 7.14         | 2.59        |
+| 4          | 11.58        | 4.39        |
+| 8          | 21.64        | 8.41        |
+| 16         | 44.20        | 16.90       |
+| 32         | 92.69        | 32.21       |
+
+Model source: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
+
+### 7.
Vision Transformer (ViT) +Input shape: bx3x224x224 +| Batch size | PyTorch (ms) | Poros (ms) | +|------------|--------------|-------------| +| 1 | 6.38 | 3.07 | +| 2 | 10.35 | 4.57 | +| 4 | 19.06 | 8.37 | +| 8 | 36.71 | 16.34 | +| 16 | 73.84 | 29.92 | +| 32 | 147.70 | 58.11 | + +Model source: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/vision_transformer.py + +### 8. YOLOv5s +Input shape: bx3x640x640 +| Batch size | PyTorch (ms) | Poros (ms) | +|------------|--------------|-------------| +| 1 | 6.17 | 2.22 | +| 2 | 5.93 | 3.96 | +| 4 | 10.02 | 6.84 | +| 8 | 20.02 | 12.86 | +| 16 | 38.17 | 24.80 | +| 32 | 77.19 | 49.16 | + +Model source: https://github.com/ultralytics/yolov5/blob/master/models/yolo.py + +### 9. Swin Transformer +Input shape: bx3x224x224 +| Batch size | PyTorch (ms) | Poros (ms) | +|------------|--------------|-------------| +| 1 | 14.11 | 7.68 | +| 2 | 22.73 | 11.99 | +| 4 | 42.21 | 21.74 | +| 8 | 83.07 | 42.18 | +| 16 | 162.34 | 78.34 | +| 32 | 317.43 | 149.72 | + +Model source: https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py \ No newline at end of file diff --git a/poros/example/test_resnet.py b/poros/example/test_resnet.py new file mode 100644 index 0000000000..7c9ad1f878 --- /dev/null +++ b/poros/example/test_resnet.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +test resnet50 +""" + +import time +import poros +import torch +from torchvision import models + +torch.set_grad_enabled(False) + +def load_example_input_datas(): + """fake data""" + data_list = [] + input_1 = torch.randn(1, 3, 224, 224, dtype=torch.float32).cuda() + data_list.append(input_1) + return data_list + + +if __name__ == '__main__': + + input_datas = load_example_input_datas() + original_model = models.resnet50(pretrained=True).cuda().eval() + + option = poros.PorosOptions() + # option.max_workspace_size = 1 << 30 + # option.is_dynamic = False + # option.debug = True + # option.unconst_ops_thres = 0 + + + try: + poros_model = poros.compile(torch.jit.script(original_model), input_datas, option) + except Exception as e: + print("compile poros_model failed. 
error msg: {}".format(e)) + exit(0) + + + for input in input_datas: + ori_res = original_model(input) + poros_res = poros_model(input) + res_diff = torch.abs(ori_res - poros_res) + print("max_diff", torch.max(res_diff)) + print(poros_res.shape) + + # warm up + for i in range (100): + for input in input_datas: + ori_res = original_model(input) + poros_res = poros_model(input) + + count = 1000 + + # POROS benchmark + torch.cuda.synchronize() + st = time.time() + for i in range (count): + # step4: 预测。 + for input in input_datas: + poros_res = poros_model(input) + + torch.cuda.synchronize() + poros_elapsed_time = time.time() - st + print("poros infer time:{:.5f}ms/infer".format(poros_elapsed_time)) + + # original benchmark + torch.cuda.synchronize() + st = time.time() + for i in range (count): + # step4: 预测。 + for input in input_datas: + ori_res = original_model(input) + + torch.cuda.synchronize() + original_elapsed_time = time.time() - st + print("original infer time/:{:.5f}ms/infer".format(original_elapsed_time)) + print("speedup: +{:.2f}%".format((original_elapsed_time / poros_elapsed_time - 1 ) * 100)) \ No newline at end of file diff --git a/poros/python/example/example.py b/poros/python/example/example.py new file mode 100644 index 0000000000..695c3008bf --- /dev/null +++ b/poros/python/example/example.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +import torch +import poros +import numpy as np + +def load_example_model(): + """load model示例,正常load model即可""" + import torchvision.models as models + std_resnet = models.resnet50(pretrained=True) + std_resnet.cuda() + std_resnet.eval() + return std_resnet + + +def load_example_input_datas(): + """加载预热数据""" + data_list = [] + #max size + input_1 = np.ones((3, 3, 96, 320), np.float32) + input_tensor = torch.from_numpy(input_1).cuda() + data_list.append(input_tensor) + + #min size + input_2 = np.ones((1, 3, 96, 320), np.float32) + input_tensor2 = torch.from_numpy(input_2).cuda() + data_list.append(input_tensor2) + + #opt size + input_3 = np.ones((1, 3, 96, 320), np.float32) + input_tensor3 = torch.from_numpy(input_3).cuda() + data_list.append(input_tensor3) + + return data_list + + +if __name__ == '__main__': + print("this is an example for poros") + + # step1: 按照正常的torch模块的步骤,load模型和参数,此处以resnet50为例 + # load_example_model 过程中load的原始pytorch模型(python代码),必须是完成了poros预处理的python代码 + # poros预处理相关wiki: 【待补充】 + original_model = load_example_model() + + # step2: 准备预热数据。 + # 请准备 1-3 份内容不一样的预热数据(example中是准备了3份一样的预热数据,只是示例,实际中尽量不要这样做) + # 每一份预热数据用tuple封装,除非该模型只有一个输入,且这个输入类型是torch.Tensor. 
+    # Multiple prewarm inputs are joined in a list.
+    # NOTE: prewarm data is required.
+    input_datas = load_example_input_datas()
+
+    # step 3: call poros to compile the original model and get a PorosModel.
+    # When option.is_dynamic is true, the number of prewarm inputs must be a multiple of 3.
+    # When option.is_dynamic is false, at least one prewarm input must be provided.
+    option = poros.PorosOptions()
+    option.is_dynamic = True
+    #option.debug = True
+
+    try:
+        poros_model = poros.compile(original_model, input_datas, option)
+    except Exception as e:
+        print("compile poros_model failed. error msg: {}".format(e))
+        #poros_model = original_model
+        exit(0)
+
+    # serialization & deserialization
+    # poros.save(poros_model, "poros_model.pt")
+    # poros_model = poros.load("poros_model.pt", option)
+
+    # prepare batch data for testing
+    input = np.ones((3, 3, 96, 320), np.float32)
+    batch_tensor = torch.from_numpy(input).cuda()
+
+    # step 4: inference.
+    #result = poros_model(input_datas[0])
+    result = poros_model(batch_tensor)
+    #result = original_model(batch_tensor)
+
+    print(result.size())
+    print(result)
diff --git a/poros/python/poros/__init__.py b/poros/python/poros/__init__.py
new file mode 100644
index 0000000000..0c2550fa20
--- /dev/null
+++ b/poros/python/poros/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+init file for poros
+"""
+
+import os
+import sys
+
+if sys.version_info < (3, 6):
+    raise Exception("Poros can only work on Python 3.6+")
+
+import ctypes
+import torch
+
+from poros._compile import *
+from poros._module import PorosOptions
+
+def _register_with_torch():
+    """load libporos.so so that its custom ops are registered with torch"""
+    poros_dir = os.path.dirname(__file__)
+    torch.ops.load_library(poros_dir + '/lib/libporos.so')
+
+_register_with_torch()
\ No newline at end of file
diff --git a/poros/python/poros/_compile.py b/poros/python/poros/_compile.py
new file mode 100644
index 0000000000..056a3b0be8
--- /dev/null
+++ b/poros/python/poros/_compile.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+compile function for poros.
+""" + +from typing import List, Dict, Any +import torch +from torch import nn + +import poros._C +from poros._input_convert import convert_prewarm_inputs +from poros._input_convert import convert_poros_option +from poros._module import PorosModule + + +def wrap_cpp_module(cpp_module): + """ + Wrap torch._C.ScriptModule to porosModule, recursively for all submodules + """ + def init_fn(script_module): + """init_fn""" + for name, cpp_module in torch._C.ModuleDict(script_module._c).items(): + setattr(script_module, name, wrap_cpp_module(cpp_module)) + script_module._concrete_type = torch._C.ConcreteModuleType.from_jit_type(script_module._c._type()) + + for idx, fn in enumerate(script_module._c._get_forward_pre_hooks()): + script_module._forward_pre_hooks[idx] = fn + for idx, fn in enumerate(script_module._c._get_forward_hooks()): + script_module._forward_hooks[idx] = fn + + return PorosModule._construct(cpp_module, init_fn) + + +def load(filename, poros_options): + """ + Args: + filename( str): poros model save path + poros_options(PorosOptions / Dict of settings): compile settings for poros + Returns: + PorosModule: Compiled Module of poros, + when run it will partially execute via inlined engine (which is TensorRT) + """ + compiled_cpp_mod = poros._C.load(filename, convert_poros_option(poros_options)) + compiled_module = wrap_cpp_module(compiled_cpp_mod) + return compiled_module + +def save(module, filename): + """ + Args: + module(PorosModule): poros module + filename( str): poros model save path + """ + assert type(module).__name__ == "PorosModule", "The type of module must be PorosModule" + assert type(filename).__name__ == "str", "The type of filename must be str" + module.save(filename) + +def compile(module, prewarm_inputs, poros_options): + """ + Compile a TorchScriptModule/nn.Module to porosModule + Converts specifically the forward method of the original Module + Args: + module (torch.nn.Module / torch.jit.ScriptModule): Source module + input (list of tensor input): prewarmed data. + poros_options(PorosOptions / Dict of settings): compile settings for poros + Returns: + PorosModule: Compiled Module of poros, + when run it will partially execute via inlined engine (which is TensorRT) + """ + if poros_options.device == "GPU": + assert "cuda" in str(list(module.state_dict().values())[0].device), \ + "If the poros_options.device is GPU, the module.device should also is GPU" + + sp_model = None + if isinstance(module, torch.jit.ScriptModule): + sp_model = module + else: + if poros_options.preprocess_mode == 0: + sp_model = torch.jit.script(module, optimize=None, _frames_up=0, _rcb=None) + elif poros_options.preprocess_mode == 1: + sp_model = torch.jit.trace(module, prewarm_inputs[0]) + else: + raise ValueError( + "preprocess_mode value err: The range of preprocess_mode is [0,1]") + + if sp_model is None: + raise TypeError( + "can't trans to poros module currently") + + wraped_inputs = convert_prewarm_inputs(prewarm_inputs) + + compiled_cpp_mod = poros._C.compile_graph(sp_model._c, wraped_inputs, convert_poros_option(poros_options)) + compiled_module = wrap_cpp_module(compiled_cpp_mod) + return compiled_module diff --git a/poros/python/poros/_input_convert.py b/poros/python/poros/_input_convert.py new file mode 100644 index 0000000000..256cb06abc --- /dev/null +++ b/poros/python/poros/_input_convert.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 Baidu, Inc. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    convert prewarm data to a c10::IValue list that poros can handle.
+"""
+
+import typing  # List & Dict & Any
+import torch
+import poros._C
+
+from poros._module import DynamicOptions
+from poros._module import PorosOptions
+from poros._parse_util import _parse_device
+
+def make_to_tuple(prewarm_input):
+    """
+    wrap a single torch.Tensor input into a tuple
+    """
+    if isinstance(prewarm_input, torch.Tensor):
+        return (prewarm_input,)
+    # done primarily so that weird iterables fail here and not pybind11 code
+    if not isinstance(prewarm_input, tuple):
+        return tuple(prewarm_input)
+    return prewarm_input
+
+
+def convert_prewarm_inputs(prewarm_inputs):
+    # type: (Any) -> poros._C.PreWarmDatas
+    """
+    convert prewarm data to a c10::IValue list that poros can handle.
+    we accept the following kinds of prewarm_inputs:
+    one input that has a single tensor [torch.Tensor]
+    one input that has multiple variables [tuple]
+    more than one input, where each input has a single tensor [list of torch.Tensor]
+    more than one input, where each input has multiple variables [list of tuple]
+    """
+    wrapped_prewarm_inputs = []
+    if isinstance(prewarm_inputs, torch.Tensor):
+        wrapped_prewarm_inputs.append(make_to_tuple(prewarm_inputs))
+    elif isinstance(prewarm_inputs, tuple):
+        wrapped_prewarm_inputs.append(prewarm_inputs)
+    elif isinstance(prewarm_inputs, list):
+        for member in prewarm_inputs:
+            if isinstance(member, torch.Tensor):
+                wrapped_prewarm_inputs.append(make_to_tuple(member))
+            elif isinstance(member, tuple):
+                wrapped_prewarm_inputs.append(member)
+            else:
+                raise TypeError("prewarm_inputs for poros should be torch.Tensor or wrapped as a tuple; fix it")
+    else:
+        raise TypeError("prewarm_inputs for poros should be torch.Tensor, wrapped as a tuple, or a list of inputs; fix it")
+    return wrapped_prewarm_inputs
+    # info = poros._C.PreWarmDatas()
+    # info.set_data(prewarm_inputs)
+
+def convert_poros_option(poros_option):
+    # type: Dict[str, Any] -> poros._C.PorosOptions
+    """
+    convert a key-value poros_option to a PorosOptions object that poros can handle
+    """
+    option = poros._C.PorosOptions()
+    if poros_option is None:
+        # default situation: the user did not set poros_option
+        return option
+    elif isinstance(poros_option, dict):
+        if "debug" in poros_option:
+            assert isinstance(poros_option["debug"], bool)
+            option.debug = poros_option["debug"]
+
+        if "use_fp16" in poros_option:
+            assert isinstance(poros_option["use_fp16"], bool)
+            option.use_fp16 = poros_option["use_fp16"]
+
+        if "max_workspace_size" in poros_option:
+            assert type(poros_option["max_workspace_size"]) is int
+            option.max_workspace_size = poros_option["max_workspace_size"]
+
+        if "device" in poros_option:
+            option.device = _parse_device(poros_option["device"])
+
+        if "is_dynamic" in poros_option:
+            assert isinstance(poros_option["is_dynamic"], bool)
+            option.is_dynamic = poros_option["is_dynamic"]
+
+        if "long_to_int" in poros_option:
+            assert isinstance(poros_option["long_to_int"], bool)
+            option.long_to_int = poros_option["long_to_int"]
+
+        if "device_id" in poros_option:
+            assert type(poros_option["device_id"]) is int
+            option.device_id = poros_option["device_id"]
+
+        if "preprocess_mode" in poros_option:
+            assert type(poros_option["preprocess_mode"]) is int
+            option.preprocess_mode = poros_option["preprocess_mode"]
+
+        return option
+    elif isinstance(poros_option, PorosOptions):
+        return poros_option.to_internal()
+    else:
+        raise TypeError("poros_option for poros should be a PorosOptions object or an attribute dict; fix it")
diff --git a/poros/python/poros/_module.py b/poros/python/poros/_module.py
new file mode 100644
index 0000000000..13349eaf06
--- /dev/null
+++ b/poros/python/poros/_module.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+poros module here
+"""
+
+import poros._C
+from poros._parse_util import _parse_device
+
+from torch.jit._script import RecursiveScriptModule
+from torch.jit._script import ScriptModule
+from torch.jit._script import script
+
+class DynamicOptions(object):
+    """
+    dynamic settings for poros
+    """
+    def __init__(self):
+        """set default dynamic options"""
+        self.is_dynamic = False
+        self.min_shapes = []
+        self.opt_shapes = []
+        self.max_shapes = []
+
+    def set_dynamic_options(self, min, opt, max):
+        """situation when three inputs are given"""
+        option_list = [min, opt, max]
+        for item in option_list:
+            if not (isinstance(item, list)):
+                raise TypeError("dynamic_option for poros should be an IntList; fix it")
+        option_list.sort()
+        self.min_shapes = option_list[0]
+        self.opt_shapes = option_list[1]
+        self.max_shapes = option_list[2]
+
+    def set_dynamic_option(self, opt):
+        """situation when only one input is given"""
+        if not isinstance(opt, list):
+            raise TypeError("dynamic_option for poros should be an IntList; fix it")
+        else:
+            self.min_shapes = opt
+            self.opt_shapes = opt
+            self.max_shapes = opt
+            self.is_dynamic = False
+
+    def get_dynamic_options(self):
+        """get dynamic options"""
+        return [self.min_shapes, self.opt_shapes, self.max_shapes]
+
+    def to_internal(self):
+        """
+        change DynamicOptions in python env to DynamicShapeOptions in c++ env
+        """
+        option = poros._C.DynamicShapeOptions()
+        assert isinstance(self.is_dynamic, bool)
+        option.is_dynamic = self.is_dynamic
+
+        assert isinstance(self.min_shapes, list)
+        option.min_shapes = self.min_shapes
+        assert isinstance(self.opt_shapes, list)
+        option.opt_shapes = self.opt_shapes
+        assert isinstance(self.max_shapes, list)
+        option.max_shapes = self.max_shapes
+        return option
+
+class PorosOptions(object):
+    """
+    options for poros
+    """
+    available_devices = ["GPU", "CPU", "XPU"]
+    available_debug_mode = [True, False]
+    def __init__(self):
+        self.device = "GPU"
+        self.debug = False
+        self.use_fp16 = False
+        self.max_workspace_size = 1 << 30
+        self.is_dynamic = False
+        self.long_to_int = True
+        self.device_id = -1
+        self.unconst_ops_thres = -1
+        self.use_nvidia_tf32 = True
+        self.preprocess_mode = 0
+        self.unsupport_op_list = []
+
+    def to_internal(self):
+        """
+        change PorosOptions in python env to PorosOptions in c++ env
+        """
+        option = poros._C.PorosOptions()
+        option.device = _parse_device(self.device)
+        assert isinstance(self.debug, bool)
+        option.debug = self.debug
+        assert isinstance(self.use_fp16, bool)
+        option.use_fp16 = self.use_fp16
+        assert type(self.max_workspace_size) is int
+        option.max_workspace_size = self.max_workspace_size
+        assert isinstance(self.is_dynamic, bool)
+        option.is_dynamic = self.is_dynamic
+        assert isinstance(self.long_to_int, bool)
+        option.long_to_int = self.long_to_int
+        assert type(self.device_id) is int
+        option.device_id = self.device_id
+        assert type(self.unconst_ops_thres) is int
+        option.unconst_ops_thres = self.unconst_ops_thres
+        assert type(self.use_nvidia_tf32) is bool
+        option.use_nvidia_tf32 = self.use_nvidia_tf32
+        assert type(self.preprocess_mode) is int
+        option.preprocess_mode = self.preprocess_mode
+        assert type(self.unsupport_op_list) is list
+        option.unsupport_op_list = self.unsupport_op_list
+
+        return option
+
+    def set_device(self, device):
+        """set device"""
+        if device not in PorosOptions.available_devices:
+            raise TypeError("invalid device for poros, only %s are supported, fix it" % (PorosOptions.available_devices))
+        self.device = device
+
+    def set_debug(self, debug):
"""set debug""" + if debug not in PorosOptions.available_debug_mode: + raise TypeError("device for poros invalid, only %s supported, fix it" % (PorosOptions.available_debug_mode)) + self.debug = debug + + +class PorosModule(RecursiveScriptModule): + """ + The core data structure of poros. + """ + def __init__(self, cpp_module): + super(PorosModule, self).__init__(cpp_module) + # self.options = PorosOptions() + # if option is not None and isinstance(option, PorosOptions): + # self.options = option + + @staticmethod + def _construct(cpp_module, init_fn): + """ + Construct a PorosModule that's ready for use. + Args: + cpp_module: The C++ Module that will hold the actual state of + this PorosModule instance. + init_fn: Lambda that initializes the PorosModule passed to it. + """ + script_module = PorosModule(cpp_module) + init_fn(script_module) + + # Finalize the ScriptModule: replace the nn.Module state with our + # custom implementations and flip the _initializing bit. + PorosModule._finalize_scriptmodule(script_module) + return script_module + + @property + def supported_engine(self): + """supported engine""" + return ["tensorrt"] + + # @property + # def options(self): + # """current options""" + # return self.options + diff --git a/poros/python/poros/_parse_util.py b/poros/python/poros/_parse_util.py new file mode 100644 index 0000000000..a3b74dc24a --- /dev/null +++ b/poros/python/poros/_parse_util.py @@ -0,0 +1,39 @@ +# Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + parse util for some python settings to poros c++ setting. +""" + +import typing # List & Dict & Any +import poros._C + +def _parse_device(device): + # type: Any -> poros._C.Device + """ + converter device info to Device struct that can handle in poros + """ + if isinstance(device, poros._C.Device): + return device + elif isinstance(device, str): + if device == "GPU" or device == "gpu": + return poros._C.Device.GPU + elif device == "CPU" or device == "cpu": + return poros._C.Device.CPU + elif device == "XPU" or device == "xpu": + return poros._C.Device.XPU + else: + ValueError("Got a device type unknown (type: " + str(device) + ")") + else: + raise TypeError("Device specification must be of type string or poros.Device, but got: " + + str(type(device))) \ No newline at end of file diff --git a/poros/python/poros/csrc/poros_py.cpp b/poros/python/poros/csrc/poros_py.cpp new file mode 100644 index 0000000000..caed7e985a --- /dev/null +++ b/poros/python/poros/csrc/poros_py.cpp @@ -0,0 +1,102 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/poros/python/poros/_parse_util.py b/poros/python/poros/_parse_util.py
new file mode 100644
index 0000000000..a3b74dc24a
--- /dev/null
+++ b/poros/python/poros/_parse_util.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    parse util for some python settings to poros c++ setting.
+"""
+
+import typing  # List & Dict & Any
+import poros._C
+
+def _parse_device(device):
+    # type: Any -> poros._C.Device
+    """
+    convert device info to a Device struct that poros can handle
+    """
+    if isinstance(device, poros._C.Device):
+        return device
+    elif isinstance(device, str):
+        if device == "GPU" or device == "gpu":
+            return poros._C.Device.GPU
+        elif device == "CPU" or device == "cpu":
+            return poros._C.Device.CPU
+        elif device == "XPU" or device == "xpu":
+            return poros._C.Device.XPU
+        else:
+            raise ValueError("Got a device type unknown (type: " + str(device) + ")")
+    else:
+        raise TypeError("Device specification must be of type string or poros.Device, but got: " +
+                        str(type(device)))
\ No newline at end of file
diff --git a/poros/python/poros/csrc/poros_py.cpp b/poros/python/poros/csrc/poros_py.cpp
new file mode 100644
index 0000000000..caed7e985a
--- /dev/null
+++ b/poros/python/poros/csrc/poros_py.cpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_py.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Jul 1 10:25:01 CST 2021
+* @brief
+**/
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+#include "Python.h"
+
+#include "torch/csrc/jit/python/pybind_utils.h"
+#include "torch/csrc/utils/pybind.h"
+#include "torch/custom_class.h"
+#include "torch/script.h"
+#include "torch/torch.h"
+
+#include "poros/compile/compile.h"
+
+namespace py = pybind11;
+
+namespace poros {
+namespace pyapi {
+
+torch::jit::Module compile_graph(const torch::jit::Module& mod,
+                                const py::list& input_list,
+                                baidu::mirana::poros::PorosOptions& poros_option) {
+    auto function_schema = mod.get_method("forward").function().getSchema();
+    py::gil_scoped_acquire gil;
+    std::vector<torch::jit::Stack> prewarm_datas;
+    for (auto& input_tuple : input_list) {
+        torch::jit::Stack stack;
+        for (auto& input : input_tuple) {
+            stack.push_back(torch::jit::toTypeInferredIValue(input));
+        }
+        prewarm_datas.push_back(stack);
+    }
+
+    auto poros_mod = baidu::mirana::poros::CompileGraph(mod, prewarm_datas, poros_option);
+    if (poros_mod) {
+        return *poros_mod;
+    } else {
+        throw c10::Error("compile failed", "");
+    }
+}
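+
+// Note: `input_list` above is a list of input tuples; each tuple becomes one
+// torch::jit::Stack, so every element of `prewarm_datas` represents a single
+// warm-up invocation of forward().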
+
+torch::jit::Module load(const std::string& filename, const baidu::mirana::poros::PorosOptions& options) {
+    auto poros_mod = baidu::mirana::poros::Load(filename, options);
+    return *poros_mod;
+}
+
+PYBIND11_MODULE(_C, m) {
+    py::enum_<baidu::mirana::poros::Device>(m, "Device", "Enum to specify device kind to build poros Module")
+        .value("GPU", baidu::mirana::poros::Device::GPU, "Specify using GPU as the backend of poros Module")
+        .value("CPU", baidu::mirana::poros::Device::CPU, "Specify using CPU as the backend of poros Module")
+        .value("XPU", baidu::mirana::poros::Device::XPU, "Specify using XPU as the backend of poros Module")
+        .export_values();
+
+    py::class_<baidu::mirana::poros::PorosOptions>(m, "PorosOptions")
+        .def(py::init<>())
+        .def_readwrite("device", &baidu::mirana::poros::PorosOptions::device)
+        .def_readwrite("debug", &baidu::mirana::poros::PorosOptions::debug)
+        .def_readwrite("use_fp16", &baidu::mirana::poros::PorosOptions::use_fp16)
+        .def_readwrite("is_dynamic", &baidu::mirana::poros::PorosOptions::is_dynamic)
+        .def_readwrite("long_to_int", &baidu::mirana::poros::PorosOptions::long_to_int)
+        .def_readwrite("device_id", &baidu::mirana::poros::PorosOptions::device_id)
+        .def_readwrite("max_workspace_size", &baidu::mirana::poros::PorosOptions::max_workspace_size)
+        .def_readwrite("unconst_ops_thres", &baidu::mirana::poros::PorosOptions::unconst_ops_thres)
+        .def_readwrite("use_nvidia_tf32", &baidu::mirana::poros::PorosOptions::use_nvidia_tf32)
+        .def_readwrite("preprocess_mode", &baidu::mirana::poros::PorosOptions::preprocess_mode)
+        .def_readwrite("unsupport_op_list", &baidu::mirana::poros::PorosOptions::unsupport_op_list);
+
+    m.doc() = "Poros C Bindings";
+    m.def(
+        "compile_graph",
+        &poros::pyapi::compile_graph,
+        "compile a PyTorch module and return a Poros module \
+        that can significantly lower the inference latency");
+    m.def(
+        "load",
+        &poros::pyapi::load,
+        "load poros model");
+}
+
+} // namespace pyapi
+} // namespace poros
diff --git a/poros/python/setup.py b/poros/python/setup.py
new file mode 100644
index 0000000000..fb0b8cfa6a
--- /dev/null
+++ b/poros/python/setup.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A setup module for the poros Python package.
+"""
+
+# for python2 compatibility
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import glob
+import sys
+import setuptools
+from setuptools import find_packages
+from setuptools.command.develop import develop
+from setuptools.command.install import install
+import shutil
+from torch.utils import cpp_extension
+from wheel.bdist_wheel import bdist_wheel
+from distutils.cmd import Command
+from distutils import spawn
+from distutils.sysconfig import get_python_lib
+import multiprocessing
+
+# Constant known variables used throughout this file
+THREAD_NUM = multiprocessing.cpu_count()
+CXX11_ABI = False
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+
+if "--use-cxx11-abi" in sys.argv:
+    sys.argv.remove("--use-cxx11-abi")
+    CXX11_ABI = True
+
+def cmake_build():
+    """execute cmake build, to make the shared lib `libporos.so` """
+    cwd = os.getcwd()
+    if spawn.find_executable('cmake') is None:
+        sys.stderr.write("CMake is required to build this package.\n")
+        sys.exit(-1)
+    _source_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    _build_dir = os.path.join(_source_dir, 'build')
+    _prefix = get_python_lib()
+    try:
+        cmake_configure_command = [
+            'cmake',
+            '-H{0}'.format(_source_dir),
+            '-B{0}'.format(_build_dir),
+            '-DCMAKE_INSTALL_PREFIX={0}'.format(_prefix),
+        ]
+        _generator = os.getenv('CMAKE_GENERATOR')
+        if _generator is not None:
+            cmake_configure_command.append('-G{0}'.format(_generator))
+        spawn.spawn(cmake_configure_command)
+        spawn.spawn(
+            ['cmake', '--build', _build_dir, '-j', str(THREAD_NUM)])
+        os.chdir(cwd)
+    except spawn.DistutilsExecError:
+        sys.stderr.write("Error while building with CMake\n")
+        sys.exit(-1)
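+
+# cmake_build() above is roughly equivalent to (paths resolved at runtime):
+#     cmake -H<repo_root> -B<repo_root>/build -DCMAKE_INSTALL_PREFIX=<site-packages>
+#     cmake --build <repo_root>/build -j <cpu_count>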
"clean" not in sys.argv: + cmake_build() + + if os.path.exists('./poros/lib') == False: + os.mkdir('./poros/lib') + + shutil.copy("../build/lib/libporos.so", "./poros/lib/libporos.so") + + # this is for torch customer extension + C = cpp_extension.CppExtension( + 'poros._C', [ + 'poros/csrc/poros_py.cpp', + ], + library_dirs=[poros_lib_path, "../third_party/tensorrtlib"], + libraries=["poros"], + include_dirs=[ + CURRENT_PATH + "/poros/csrc", + CURRENT_PATH + "/../build/include", + + ], + extra_compile_args=[ + "-Wno-deprecated", + "-Wno-deprecated-declarations", + "-Wno-unused-function", + '-Werror', + '-fopenmp', + '-D__const__=', '-g', '-O2', '-fPIC', + ], + extra_link_args=[ + "-Wno-deprecated", "-Wno-deprecated-declarations", + "-Wno-unused-function", + "-Wl,--no-as-needed", + "-lporos", + "-lnvinfer", + "-lnvinfer_plugin", + "-Wl,-rpath,$ORIGIN/lib", + "-lpthread", "-ldl", "-lutil", "-lrt", "-lm", "-Xlinker", "-export-dynamic", + ], + ) + + setuptools.setup( + name="poros", + version="0.1.0", + author="PorosTeam@BaiDu", + description='A compiler backend for PyTorch and automatically accelerate inference using tensorrt engine', + ext_modules=[C], + packages=find_packages(), + include_package_data=True, + package_data={ + 'poros': ['lib/*.so'], + }, + exclude_package_data={ + '': ['*.cpp', '*.h'], + 'poros': ['csrc/*.cpp'], + }, + install_requires=[ + 'torch>=1.9.0', + ], + cmdclass={ + 'clean': CleanCommand, + 'build_ext': cpp_extension.BuildExtension, + }, + ) + print("Setup baidu.poros python module success!") diff --git a/poros/src/poros/compile/compile.cpp b/poros/src/poros/compile/compile.cpp new file mode 100644 index 0000000000..347adf7cad --- /dev/null +++ b/poros/src/poros/compile/compile.cpp @@ -0,0 +1,482 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/poros/src/poros/compile/compile.cpp b/poros/src/poros/compile/compile.cpp
new file mode 100644
index 0000000000..347adf7cad
--- /dev/null
+++ b/poros/src/poros/compile/compile.cpp
@@ -0,0 +1,482 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file compile.cpp
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Fri Mar 5 11:39:03 CST 2021
+* @brief
+**/
+#include "poros/compile/compile.h"
+
+//pytorch
+#include
+
+//pytorch passes
+#include
+#include
+#include
+#include
+#include
+//#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "poros/compile/graph_prewarm.h"
+#include "poros/compile/graph_segment.h"
+#include "poros/context/poros_global.h"
+#include "poros/lowering/lowering_pass.h"
+#include "poros/lowering/op_fuse_pass.h"
+#include "poros/lowering/segment_post_processing.h"
+#include "poros/util/poros_util.h"
+// #include "poros/iplugin/plugin_create.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+Compiler::~Compiler() {
+    close();
+}
+
+int Compiler::init(const PorosOptions& options) {
+    _options = options;
+    if (options.debug == true) {
+        // when setting this, all the INFO level will be printed
+        c10::ShowLogInfoToStderr();
+    }
+    if (_options.unconst_ops_thres == -1) {
+        _options.unconst_ops_thres = 10;
+        if (_options.device == Device::XPU) {
+            _options.unconst_ops_thres = 1;
+        }
+    }
+    PorosGlobalContext::instance().set_poros_options(_options);
+    return 0;
+}
+
+int Compiler::preprocess_graph(std::shared_ptr<torch::jit::Graph>& graph) {
+    GRAPH_DEBUG("Before preprocess_graph:", graph);
+
+    //step1: some passes provided by pytorch that are insensitive to profiling
+    {
+        torch::jit::Inline(*graph);
+        GRAPH_DUMP("inline graph:", graph);
+        /* PropagateInputShapes and PropagateRequiresGrad may no longer be needed */
+        torch::jit::PropagateInputShapes(graph);
+
+        torch::jit::ClearProfilingInformation(graph);
+        torch::jit::LowerGradOf(*graph); //TODO: maybe no need
+        torch::jit::PropagateRequiresGrad(graph);
+
+        torch::jit::runRequiredPasses(graph);
+        /* DecomposeOps tries to replace addmm & batch_norm & layer_norm.
+           considering poros can handle batch_norm, we do not unfold these. */
+        //torch::jit::DecomposeOps(graph);
+        torch::jit::ConstantPropagation(graph);
+        torch::jit::EliminateDeadCode(graph);
+        torch::jit::EliminateCommonSubexpression(graph);
+        torch::jit::ConstantPooling(graph);
+        torch::jit::PeepholeOptimize(graph, false);
+        torch::jit::EliminateDeadCode(graph);
+        torch::jit::LowerSimpleTuples(graph); // TODO: maybe should not be here
+        torch::jit::CheckInplace(graph);
+        /* DO NOT set LowerAllTuples.
+         * this may cause the method output to not be a tuple,
+         * producing errors like: "Method (but not graphs in general) require a single output."
+         */
+        // torch::jit::LowerAllTuples(graph);
+    }
+
+    //step2: some passes provided by poros that are insensitive to profiling
+    {
+        baidu::mirana::poros::replace_illegal_constant(graph);
+        baidu::mirana::poros::eliminate_exception_pass(graph);
+        baidu::mirana::poros::eliminate_maxpool_with_indices(graph);
+        baidu::mirana::poros::eliminate_simple_useless_nodes(graph);
+        baidu::mirana::poros::unpack_std(graph);
+        baidu::mirana::poros::unpack_var(graph);
+        baidu::mirana::poros::replace_log_softmax(graph);
+        baidu::mirana::poros::replace_log_sigmoid(graph);
+        baidu::mirana::poros::replace_pad(graph);
+        baidu::mirana::poros::fuse_ops_preprocess(graph);
+        torch::jit::runRequiredPasses(graph);
+    }
+
+    GRAPH_DEBUG("After preprocess_graph:", graph);
+    return 0;
+}
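+
+// The compile() pipeline below: freeze the module, run the profile-insensitive
+// passes above, lower the graph, prewarm it with the sample inputs, segment it
+// into engine-supported subgraphs, and replace each subgraph with an engine call.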
+
+int Compiler::compile(const torch::jit::Module& origin_module,
+        const ivalue_vec_t& prewarm_datas, torch::jit::Module* optimized_module) {
+
+    _origin_module = &origin_module;
+    _prewarm_datas = prewarm_datas;
+
+    GRAPH_DUMP("origin_module graph:", origin_module.get_method("forward").graph());
+    torch::jit::setGraphExecutorOptimize(true);
+
+    std::shared_ptr<torch::jit::Graph> opt_graph = nullptr;
+    {
+        //step1: clone origin module to unfold module
+        torch::jit::Module intermediate_module = torch::jit::freeze_module(origin_module);
+        auto method = intermediate_module.get_method("forward");
+        auto graph = method.graph();
+        int ret = preprocess_graph(graph);
+        if (ret < 0) {
+            LOG(ERROR) << "preprocess_graph failed!";
+            return -1;
+        }
+        //attention. graph copy happened inside LowerGraph function
+        auto graph_and_ivalues = torch::jit::LowerGraph(*graph, intermediate_module._ivalue());
+        opt_graph = graph_and_ivalues.first;
+    }
+
+    // for CPU, return right after preprocessing
+    if (_options.device == Device::CPU) {
+        merge_graph_to_module(opt_graph, *optimized_module, true);
+        return 0;
+    }
+
+    std::shared_ptr<torch::jit::Graph> prewarm_graph = graph_prewarm(opt_graph, prewarm_datas);
+    GRAPH_DUMP("prewarmed_module graph:", prewarm_graph);
+
+    //step2: try to find segments in the unfolded module
+    // (after this step the graph has been partitioned into subgraphs)
+    int ret = segment_graph(prewarm_graph);
+    if (ret < 0) {
+        LOG(ERROR) << "segment_graph failed!";
+        return -1;
+    }
+    GRAPH_DUMP("segmented_module graph:", prewarm_graph);
+
+    //step3: try to replace each subgraph with an engine graph
+    merge_graph_to_module(prewarm_graph, *optimized_module, true);
+    ret = optimize_subgraph(prewarm_graph, optimized_module);
+    if (ret < 0) {
+        LOG(ERROR) << "optimize_subgraph failed!";
+        return -1;
+    }
+    GRAPH_DUMP("optimized_module graph:", optimized_module->get_method("forward").graph());
+    return 0;
+}
+
+int Compiler::segment_graph(std::shared_ptr<torch::jit::Graph>& g) {
+
+    IEngine* engine(nullptr);
+    std::string engine_name("");
+    if (_options.device == Device::GPU) {
+        engine_name = "TensorrtEngine";
+    } else if (_options.device == Device::XPU) {
+        engine_name = "XtclEngine";
+    } else {
+        engine = nullptr;
+    }
+
+    if (engine_name != "") {
+        engine = dynamic_cast<IEngine*>(create_plugin(engine_name,
+                    PorosGlobalContext::instance()._engine_creator_map));
+        if (engine->init() < 0) {
+            delete engine;
+            return -1;
+        }
+    }
+
+    graph_segment(g, engine);
+    GRAPH_DEBUG("After segment graph:", g);
+    delete engine;
+    return 0;
+}
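+
+// Engines are instantiated through the plugin registry by name
+// ("TensorrtEngine" / "XtclEngine"); select_engine() below resolves the same
+// creator map as segment_graph() and caches one engine per subgraph node in
+// _engine_map, which close() later releases.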
+
+IEngine* Compiler::select_engine(const torch::jit::Node* n) {
+    if (n == nullptr || n->kind() != torch::jit::prim::CudaFusionGroup) {
+        return nullptr;
+    }
+
+    IEngine* engine(nullptr);
+    std::string engine_name("");
+    if (_options.device == Device::GPU) {
+        engine_name = "TensorrtEngine";
+    } else if (_options.device == Device::XPU) {
+        engine_name = "XtclEngine";
+    } else {
+        engine = nullptr;
+    }
+
+    if (engine_name != "") {
+        engine = dynamic_cast<IEngine*>(create_plugin(engine_name,
+                    PorosGlobalContext::instance()._engine_creator_map));
+        if (engine->init() < 0) {
+            return nullptr;
+        }
+        _engine_map[n] = engine;
+    }
+
+    return engine;
+}
+
+
+int Compiler::optimize_subgraph(const std::shared_ptr<torch::jit::Graph>& opt_graph,
+        torch::jit::Module* optimized_module) {
+    auto block = opt_graph->block();
+    auto ret = optimize_subblock(block, optimized_module);
+    return ret;
+}
+
+
+int Compiler::optimize_subblock(torch::jit::Block* block,
+        torch::jit::Module* optimized_module) {
+
+    std::vector<torch::jit::Node*> to_optimized_nodes;
+    // traverse this way because some CudaFusionGroups may sit inside sub-blocks
+    find_to_optimized_nodes(block, to_optimized_nodes);
+    // sort once more, just to be safe
+    std::sort(to_optimized_nodes.begin(), to_optimized_nodes.end(), [&](torch::jit::Node* a, torch::jit::Node* b) {
+        return a->isBefore(b);
+    });
+
+    //size_t i = to_optimized_nodes.size();
+    for (auto iter = to_optimized_nodes.rbegin(); iter != to_optimized_nodes.rend(); iter++) {
+        auto node = *iter;
+
+        // todo:
+        // 1. scalar outputs are not supported yet: if the engine is forced to output a
+        //    tensor, it will not match the downstream scalar type and the whole model fails to run.
+        // 2. subgraphs that have no input at all.
+        // Unmerge both cases for now; scalar outputs remain to be supported. 06.20
+        if (node->inputs().size() == 0) {
+            LOG(WARNING) << "Subgraph: " << node_info_with_attr(node) << " has no input. unmerge it.";
+            baidu::mirana::poros::unmerge_subgraph(node);
+            continue;
+        }
+        bool node_should_be_unmerged = false;
+        for (size_t i = 0; i < node->outputs().size(); i++) {
+            if (node->outputs()[i]->type()->kind() != c10::TypeKind::TensorType &&
+                    !node->outputs()[i]->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+                LOG(WARNING) << "Subgraph: " << node_info_with_attr(node) << " outputs contain non-tensor or non-tensor[] values. unmerge it.";
+                node_should_be_unmerged = true;
+                baidu::mirana::poros::unmerge_subgraph(node);
+                break;
+            }
+        }
+
+        if (node_should_be_unmerged) {
+            continue;
+        }
+
+        IEngine* engine = select_engine(node);
+        if (engine == nullptr) {
+            LOG(ERROR) << "can't find Engine for node: " << node->kind().toQualString();
+            return -1;
+        }
+        std::shared_ptr<torch::jit::Graph> subgraph = node->g(torch::jit::attr::Subgraph);
+        LOG(INFO) << "\n \n ###########\n \n"
+                  << "start to optimize graph: " << node_info_with_attr(node);
+
+        int non_constant_node_num = 0;
+        auto subblock = subgraph->block();
+
+        //engine->transform(*node, *optimized_module);
+        for (auto it = subblock->nodes().begin(); it != subblock->nodes().end(); ++it) {
+            if (it->kind() != torch::jit::prim::Constant) {
+                non_constant_node_num++;
+                if (non_constant_node_num > _options.unconst_ops_thres) {
+                    break;
+                }
+            }
+        }
+        if (non_constant_node_num <= _options.unconst_ops_thres) {
+            LOG(INFO) << "subgraph size is too small, unmerge it.";
+            baidu::mirana::poros::unmerge_subgraph(node);
+        } else {
+            if (transform(engine, *node, *optimized_module) < 0) {
+                LOG(WARNING) << "transform failed, use origin sub_graph";
+                if (_options.debug) {
+                    GRAPH_DUMP("transform failed graph: ", subgraph);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) {
+        for (torch::jit::Block* ib : it->blocks()) {
+            optimize_subblock(ib, optimized_module);
+        }
+    }
+    return 0;
+}
+
+void Compiler::close() {
+    for (auto& e : _engine_map) {
+        delete e.second;
+    }
+}
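+
+// transform() below swaps a prim::CudaFusionGroup subgraph for an engine call:
+// it registers the built engine as a module attribute, then rewires the parent
+// graph as prim::GetAttr(engine) + prim::ListConstruct(inputs) +
+// <engine>::execute_engine + prim::ListUnpack(outputs) before destroying the
+// original subgraph node.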
+
+int Compiler::transform(IEngine* engine, torch::jit::Node& subgraph_node,
+        torch::jit::Module& module) {
+
+    AT_ASSERT(subgraph_node.kind() == torch::jit::prim::CudaFusionGroup);
+    std::shared_ptr<torch::jit::Graph> sub_graph_copy = subgraph_node.g(torch::jit::attr::Subgraph)->copy();
+
+    std::string serialized_engine;
+
+    // remove useless nodes on the copied subgraph
+    if (!eliminate_subgraph_useless_nodes(sub_graph_copy, subgraph_node, false)) {
+        baidu::mirana::poros::unmerge_subgraph(&subgraph_node);
+        return -1;
+    }
+
+    PorosGraph poros_graph = {sub_graph_copy.get(), &subgraph_node};
+
+    //++poros_graph.allocated_index;
+    //int ret = engine->transform(poros_graph, serialized_engine);
+    int ret = engine->transform(poros_graph);
+    if (ret < 0) {
+        baidu::mirana::poros::unmerge_subgraph(&subgraph_node);
+        return ret;
+    }
+    // the subgraph converted successfully; now remove (replace) the useless
+    // nodes on the subgraph inputs
+    std::shared_ptr<torch::jit::Graph> sub_graph = subgraph_node.g(torch::jit::attr::Subgraph);
+    eliminate_subgraph_useless_nodes(sub_graph, subgraph_node, true);
+
+    auto parent_graph = subgraph_node.owningGraph();
+
+    // Some subgraph outputs were originally long tensors, but the TRT engine only
+    // supports int. Append an aten::to(long) after the engine to restore them,
+    // because some ops (e.g. aten::index) strictly check for long.
+    subgraph_outputs_int2long(parent_graph, subgraph_node, sub_graph_copy);
+
+    //std::pair num_io = engine_ptr->num_io;
+    //AT_ASSERT(num_io.first == sub_graph->inputs().size());
+    //AT_ASSERT(num_io.second == sub_graph->outputs().size());
+
+    //add engine to attribute
+    std::string name = engine->who_am_i() + "_" + std::to_string(_engine_index++);
+    engine->register_module_attribute(name, module);
+
+    //get self input. it's about the module func
+    torch::jit::Value* self = nullptr;
+    auto first_input_c = parent_graph->inputs()[0]->type()->cast<c10::ClassType>();
+    if (first_input_c->is_module()) {
+        self = parent_graph->inputs()[0];
+    } else {
+        self = parent_graph->insertInput(0, "self"); //should set as the first input param
+        self->setType(module._ivalue()->type());
+    }
+
+    torch::jit::WithInsertPoint guard(&subgraph_node);
+    //build new node & remove old graph
+    auto engine_node = parent_graph->createGetAttr(self, name);
+    engine_node->insertBefore(&subgraph_node);
+
+    std::vector<torch::jit::Value*> engine_inputs;
+    for (auto input : subgraph_node.inputs()) {
+        //TODO: consider the situation when an input is not a tensor
+        engine_inputs.push_back(input);
+    }
+
+    auto input_list_node = parent_graph->createList(c10::TensorType::get(),
+                torch::jit::ArrayRef<torch::jit::Value*>(engine_inputs));
+    input_list_node->insertBefore(&subgraph_node);
+
+    std::vector<torch::jit::Value*> execute_node_inputs;
+    execute_node_inputs.push_back(input_list_node->outputs()[0]);
+    execute_node_inputs.push_back(engine_node->outputs()[0]);
+
+    auto execute_node = parent_graph->create(
+                c10::Symbol::fromQualString(engine->who_am_i() + "::execute_engine"),
+                torch::jit::ArrayRef<torch::jit::Value*>(execute_node_inputs),
+                1);
+    execute_node->insertBefore(&subgraph_node);
+    execute_node->outputs()[0]->setType(c10::ListType::ofTensors());
+
+    //auto unpack_node = parent_graph->createListUnpack(execute_node->outputs()[0], num_io.second);
+    auto unpack_node = parent_graph->createListUnpack(execute_node->outputs()[0], subgraph_node.outputs().size());
+    unpack_node->insertBefore(&subgraph_node);
+
+    //AT_ASSERT(subgraph_node.outputs().size() == unpack_node->outputs().size());
+    for (size_t idx = 0; idx < unpack_node->outputs().size(); idx++) {
+        subgraph_node.outputs()[idx]->replaceAllUsesWith(unpack_node->outputs()[idx]);
+    }
+
+    subgraph_node.removeAllInputs();
+    subgraph_node.destroy(); //TODO: the subgraph itself is not cleaned up; check for a potential memory leak
+    return 0;
+}
+
+/**
+ * @brief compile graph
+ *
+ * @param [in] module : the original module
+ * @param [in] input_ivalues : prewarm data
+ * @param [in] options : poros options
+ * @return optimized_module
+ * @retval !nullptr => succeed  nullptr => failed
+ **/
+std::unique_ptr<torch::jit::Module> CompileGraph(const torch::jit::Module& origin_module,
+        const std::vector<std::vector<c10::IValue> >& prewarm_datas,
+        const PorosOptions& options) {
+    Compiler compiler;
+    if (compiler.init(options) < 0) {
+        return nullptr;
+    }
+
+    try {
+        std::unique_ptr<torch::jit::Module> optimized_module(new torch::jit::Module(origin_module._ivalue()->name() + "_poros"));
+        if (compiler.compile(origin_module, prewarm_datas, optimized_module.get()) < 0) {
+            return nullptr;
+        }
+
+        return optimized_module;
+    } catch (const c10::Error& e) {
+        LOG(ERROR) << e.msg();
+        return nullptr;
+    }
+}
+
+std::unique_ptr<PorosModule> Compile(const torch::jit::Module& module,
+        const std::vector<std::vector<c10::IValue> >& prewarm_datas,
+        const PorosOptions& options) {
+
+    auto compiled_module = CompileGraph(module, prewarm_datas, options);
+    if (compiled_module) {
+        std::unique_ptr<PorosModule> poros_module(new PorosModule(*compiled_module));
+        poros_module->_options = options;
+
+        if (options.device == Device::GPU) {
+            poros_module->to(at::kCUDA);
+        }
+
+        if (options.debug == true) {
+            // when setting this, all the INFO level will be printed
+            c10::ShowLogInfoToStderr();
+        }
+        return poros_module;
+    } else {
+        return nullptr;
+    }
+}
+
+}//poros
+}//mirana
+}//baidu
diff --git a/poros/src/poros/compile/compile.h b/poros/src/poros/compile/compile.h
new file mode 100644
index 0000000000..702a981769
--- /dev/null
+++ b/poros/src/poros/compile/compile.h
@@ -0,0 +1,169 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file compile.h
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Fri Mar 5 11:39:03 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "poros/engine/iengine.h"
+#include "poros/compile/poros_module.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
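+
+// Typical usage from C++ (illustrative sketch; `scripted_module` and
+// `prewarm_datas` are supplied by the caller):
+//     PorosOptions options;
+//     options.device = Device::GPU;
+//     auto poros_module = Compile(scripted_module, prewarm_datas, options);
+//     if (poros_module) { auto output = poros_module->forward(inputs); }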
+
+/**
+ * @brief compile graph
+ *
+ * @param [in] module : the original module
+ * @param [in] input_ivalues : prewarm data
+ * @param [in] options : poros options
+ * @return porosmodule
+ * @retval !nullptr => succeed  nullptr => failed
+ **/
+std::unique_ptr<PorosModule> Compile(const torch::jit::Module& module,
+                const std::vector<std::vector<c10::IValue> >& prewarm_datas,
+                const PorosOptions& options);
+
+class Compiler {
+public:
+    typedef std::unordered_map<const torch::jit::Node*, IEngine*> engine_map_t;
+    typedef std::vector<std::vector<c10::IValue> > ivalue_vec_t;
+
+    Compiler() : _origin_module(NULL) {}
+    ~Compiler();
+
+    /**
+     * @brief initial Compiler
+     *
+     * @param [in] options : poros options
+     * @return int
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int init(const PorosOptions& options);
+
+    /**
+     * @brief compile whole graph
+     *
+     * @param [in] origin_module
+     * @param [in] prewarm_datas : ivalue_vec_t, vector of IValue
+     * @param [out] optimized_module : optimized graph
+     * @return int
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int compile(const torch::jit::Module& origin_module,
+            const ivalue_vec_t& prewarm_datas,
+            torch::jit::Module* optimized_module);
+
+private:
+
+    /**
+     * @brief preprocess this calculation graph
+     *
+     * @param [out] graph : preprocessed graph
+     * @return int
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int preprocess_graph(std::shared_ptr<torch::jit::Graph>& graph);
+
+    /**
+     * @brief segment this calculation graph
+     *
+     * @param [in/out] graph
+     * @return int
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int segment_graph(std::shared_ptr<torch::jit::Graph>& graph);
+
+    /**
+     * @brief optimize this calculation graph (subgraph level)
+     *
+     * @param [in] opt_graph :
+     * @param [out] optimized_module : optimized module
+     * @return int
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int optimize_subgraph(const std::shared_ptr<torch::jit::Graph>& opt_graph,
+            torch::jit::Module* optimized_module);
+
+    // subgraph optimization (block level)
+    int optimize_subblock(torch::jit::Block* block,
+            torch::jit::Module* optimized_module);
+
+    /**
+     * @brief compile the subgraph into a new graph based on the given engine
+     *
+     * @param [in] engine : the engine used by the subgraph
+     * @param [in] subgraph_node : the subgraph node
+     * @param [out] module : the transformed module
+     * @retval 0 => succeed  <0 => failed
+     **/
+    int transform(IEngine* engine, torch::jit::Node& subgraph_node,
+            torch::jit::Module& module);
+
+    /**
+     * @brief select an engine according to the subgraph node and the options
+     *
+     * @param [in] node : the node representing the subgraph
+     * @return IEngine*
+     * @retval !nullptr => succeed  nullptr => failed
+     **/
+    IEngine* select_engine(const torch::jit::Node* n);
+
+    /**
+     * @brief destroy
+     *
+     * @return void
+     **/
+    void close();
+
+private:
+    int _max_segment_depth{5};                  // maximum depth of subgraph segmentation
+    ivalue_vec_t _prewarm_datas;                // input data used for prewarming
+    PorosOptions _options;
+    engine_map_t _engine_map;                   // records the engine used by each subgraph
+    const torch::jit::Module* _origin_module;   // the original module
+    std::atomic<int> _engine_index = {0};       // records the index of each engine
+};
+
+/**
+ * @brief compile graph, for internal use
+ *
+ * @param [in] module : the original module
+ * @param [in] input_ivalues : prewarm data
+ * @param [in] options : poros options
+ * @return optimized_module
+ * @retval !nullptr => succeed  nullptr => failed
+ **/
+std::unique_ptr<torch::jit::Module> CompileGraph(const torch::jit::Module& module,
+                const std::vector<std::vector<c10::IValue> >& prewarm_datas,
+                const PorosOptions& options);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/graph_prewarm.cpp b/poros/src/poros/compile/graph_prewarm.cpp
new file mode 100644
index 0000000000..645beea9c4
--- /dev/null
+++ b/poros/src/poros/compile/graph_prewarm.cpp
@@ -0,0 +1,191 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file graph_prewarm.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Apr 23 11:41:59 CST 2021
+* @brief
+**/
+#include "poros/compile/graph_prewarm.h"
+
+//pytorch
+#include
+#include
+#include
+#include
+#include
+#include
+// #include
+#include
+#include
+#include
+#include
+
+#include "poros/compile/ivalues_analysis.h"
+#include "poros/lowering/lowering_pass.h"
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+using Stack = std::vector<c10::IValue>;
+
+struct PorosGraphPrewarm {
+    PorosGraphPrewarm(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    // core logic of graph prewarm
+    std::shared_ptr<Graph> run(std::vector<Stack>& stack_vec) {
+
+        //step1: back up the prewarm data
+        //attention: the data in the stack changes during InterpreterState execution,
+        //so we copy the stack twice before the interpreter runs:
+        //one copy for input_param_propagate, the other for the final InterpreterState.
+        std::vector<Stack> stack_vec_final = stack_vec;
+        std::vector<Stack> stack_vec_copy = stack_vec;
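+
+        // Each prewarm batch is executed once under the profiling executor;
+        // getNumProfiledRuns() is set to the batch count below so that every
+        // run still collects profile records.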
+
+        //step2: the first round of ivalue analysis
+        torch::jit::getProfilingMode() = true;
+        torch::jit::getExecutorMode() = true;
+        torch::jit::setGraphExecutorOptimize(false);
+        torch::jit::getNumProfiledRuns() = stack_vec.size();
+        GRAPH_DEBUG("before first round IvalueAnalysis Graph: ", graph_);
+        std::unique_ptr<IvalueAnalysis> ia = IvalueAnalysis::analysis_ivalue_for_graph(graph_);
+        ExecutionPlan plan = ExecutionPlan(ia->graph(), "first_round_prewarm");
+        for (size_t i = 0; i < stack_vec.size(); i++) {
+            InterpreterState(plan.code).run(stack_vec[i]);
+        }
+        std::shared_ptr<Graph> output_graph = ia->graph();
+        GRAPH_DEBUG("after first round IvalueAnalysis Graph: ", output_graph);
+
+        //step3: necessary passes to eliminate the profile information in the graph
+        {
+            baidu::mirana::poros::input_param_propagate(output_graph, stack_vec_copy);
+            std::vector<Stack>().swap(stack_vec_copy);
+            GRAPH_DEBUG("after input_param_propagate Graph: ", output_graph);
+            torch::jit::ProfilingRecord::removeProfileCounter(output_graph->block());
+            GRAPH_DEBUG("after removeProfileCounter Graph: ", output_graph);
+            baidu::mirana::poros::remove_simple_type_profile_nodes(output_graph);
+            GRAPH_DEBUG("after remove_simple_type_profile_nodes Graph: ", output_graph);
+            torch::jit::RemoveProfileNodesAndSpecializeTypes(output_graph);
+            GRAPH_DEBUG("after RemoveProfileNodesAndSpecializeTypes Graph: ", output_graph);
+        }
+
+        //step4: some passes that can be run based on the profiled graph and data
+        {
+            torch::jit::runRequiredPasses(output_graph);
+            torch::jit::EliminateDeadCode(output_graph);
+            torch::jit::EliminateCommonSubexpression(output_graph);
+            /* addmm is only done as an optimization for onnx, so we disable it */
+            torch::jit::PeepholeOptimize(output_graph, /*addmm_fusion_enabled*/false);
+            torch::jit::ConstantPropagation(output_graph); //this is very necessary to prune if blocks!!!!
+            torch::jit::ConstantPooling(output_graph);
+            // torch::jit::UnrollLoops(output_graph);
+            baidu::mirana::poros::unrolling_loop(output_graph);
+            baidu::mirana::poros::freeze_percentformat(output_graph);
+            baidu::mirana::poros::freeze_aten_size(output_graph);
+            baidu::mirana::poros::freeze_aten_len(output_graph);
+            baidu::mirana::poros::unrolling_loop(output_graph);
+
+            //some mutation handling passes below
+            GRAPH_DEBUG("before remove mutation Graph: ", output_graph);
+            //TODO: handle this later; it core-dumps when we use prepare_inplace_ops.
+            //prepare_inplace_ops(output_graph);
+            torch::jit::RemoveListMutation(output_graph);
+            torch::jit::RemoveTensorMutation(output_graph);
+            GRAPH_DEBUG("after remove mutation Graph: ", output_graph);
+
+            baidu::mirana::poros::fuse_ops_prewarm(output_graph);
+
+            // run some passes again after unrolling loops
+            torch::jit::PeepholeOptimize(output_graph, /*addmm_fusion_enabled*/false);
+            torch::jit::ConstantPropagation(output_graph);
+            torch::jit::EliminateCommonSubexpression(output_graph);
+            torch::jit::CheckInplace(output_graph);
+            torch::jit::runRequiredPasses(output_graph);
+
+            baidu::mirana::poros::eliminate_some_dict(output_graph);
+            baidu::mirana::poros::eliminate_some_list(output_graph);
+
+            torch::jit::PeepholeOptimize(output_graph, /*addmm_fusion_enabled*/false);
+            torch::jit::ConstantPropagation(output_graph);
+            torch::jit::ConstantPooling(output_graph);
+            baidu::mirana::poros::unrolling_loop(output_graph);
+            torch::jit::EliminateCommonSubexpression(output_graph);
+            baidu::mirana::poros::link_mutable_list(output_graph);
+            torch::jit::CheckInplace(output_graph);
+            torch::jit::runRequiredPasses(output_graph);
+
+            torch::jit::LowerSimpleTuples(output_graph);
+        }
+
+        //step5: prepare for the second round of ivalue analysis:
+        //reset the profile number & clean the profile information from the last round.
+        torch::jit::getNumProfiledRuns() = stack_vec_final.size();
+        torch::jit::ClearProfilingInformation(output_graph);
+        std::unique_ptr<IvalueAnalysis> ia_final = IvalueAnalysis::analysis_ivalue_for_graph(output_graph);
+        ExecutionPlan plan_final = ExecutionPlan(ia_final->graph(), "second_round_prewarm");
+        for (size_t i = 0; i < stack_vec_final.size(); i++) {
+            InterpreterState(plan_final.code).run(stack_vec_final[i]);
+        }
+        std::shared_ptr<Graph> final_graph = ia_final->graph();
+
+        //step6: store the final dynamic information
+        bool is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic;
+        if (is_dynamic_shape) {
+            ia_final->gen_value_dyanamic_shape();
+        }
+        ia_final->gen_list_size();
+        ia_final->gen_int_intlist_value();
+
+        //step7: necessary passes to eliminate the profile record
+        {
+            torch::jit::ProfilingRecord::removeProfileCounter(final_graph->block());
+            baidu::mirana::poros::remove_simple_type_profile_nodes(final_graph);
+            torch::jit::RemoveProfileNodesAndSpecializeTypes(final_graph);
+            baidu::mirana::poros::freeze_aten_dim(final_graph);
+            baidu::mirana::poros::freeze_list_construct(final_graph);
+        }
+
+        GRAPH_DUMP("final graph_prewarm Graph: ", final_graph);
+        return final_graph;
+    }
+
+private:
+    std::shared_ptr<Graph> graph_;
+
+}; // struct PorosGraphPrewarm
+
+} // anonymous namespace
+
+std::shared_ptr<torch::jit::Graph> graph_prewarm(std::shared_ptr<torch::jit::Graph>& graph,
+        const std::vector<std::vector<c10::IValue> >& prewarm_datas) {
+    std::vector<std::vector<c10::IValue>> stacks;
+    for (size_t i = 0; i < prewarm_datas.size(); ++i) {  //TODO: make this better
+        std::vector<c10::IValue> stack;
+        for (c10::IValue input : prewarm_datas[i]) {
+            stack.push_back(input);
+        }
+        stacks.push_back(stack);
+    }
+    std::shared_ptr<torch::jit::Graph> new_graph = PorosGraphPrewarm(graph).run(stacks);
+    return new_graph;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/graph_prewarm.h b/poros/src/poros/compile/graph_prewarm.h
new file mode 100644
index 0000000000..417eaabcd1
--- /dev/null
+++ b/poros/src/poros/compile/graph_prewarm.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file graph_prewarm.h
+* @author tianjinjin@baidu.com
+* @date Thu Mar 18 14:33:54 CST 2021
+* @brief
+**/
+
+#pragma once
+
+//pytorch
+#include
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * @brief first, graph_prewarm feeds the given 'prewarm_datas' to the graph, into which
+ * many profile nodes have been inserted. By profiling the graph we can record the
+ * information we want for each node (dim information of tensor values / the exact
+ * numerical value of bool values, etc...).
+ *
+ * second, graph_prewarm optimizes the given graph at the graph level based on the
+ * profile data we captured.
+ *
+ * finally, graph_prewarm returns a graph that is fully optimized at the graph level.
+ *
+ * @param [in] graph : the graph to be warmed
+ * @param [in] prewarm_datas : prewarm data
+ * @return prewarmed_graph
+ **/
+std::shared_ptr<torch::jit::Graph> graph_prewarm(
+        std::shared_ptr<torch::jit::Graph>& graph,
+        const std::vector<std::vector<c10::IValue> >& prewarm_datas);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/graph_segment.cpp b/poros/src/poros/compile/graph_segment.cpp
new file mode 100644
index 0000000000..472ef11116
--- /dev/null
+++ b/poros/src/poros/compile/graph_segment.cpp
@@ -0,0 +1,1290 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
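+
+// The segmenter below greedily fuses engine-supported nodes into
+// prim::CudaFusionGroup subgraphs, consulting the alias database so that
+// moving a producer next to its consumer stays topologically valid.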
+ +/** +* @file graph_segment.cpp +* @author tianjinjin@baidu.com +* @author tianshaoqing@baidu.com +* @date Fri Mar 19 19:18:20 CST 2021 +* @brief +**/ +#include "poros/compile/graph_segment.h" + +//pytorch +#include +#include +#include +#include +#include +#include +// #include + +#include "poros/context/poros_global.h" +#include "poros/lowering/lowering_pass.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +Value* broadcast_sizes(at::ArrayRef sizes) { + AT_ASSERT(!sizes.empty()); + Graph* graph = sizes[0]->owningGraph(); + Node* broadcast_n = + graph->insertNode(graph->create(prim::BroadcastSizes, sizes)); + broadcast_n->output()->setType(ListType::ofInts()); + return broadcast_n->output(); +} + +struct PorosGraphSegment { + using FusionCallback = std::function; + Block* block_; + std::unique_ptr aliasDb_; + std::shared_ptr graph_; + Symbol kind_ = prim::CudaFusionGroup; + IEngine* engine_; + + PorosGraphSegment(Block* block, std::shared_ptr graph, IEngine* engine) + : block_(block), graph_(std::move(graph)), engine_(engine) {} + + //判断一个节点和它某个输入的关系,是否该输入的所有消费者已经在这group里了。 + bool all_users_are_this_cunsumer(Node* consumer, Value* producer) { + Node* defining_node = producer->node(); + for (Value* o : defining_node->outputs()) { + for (auto u : o->uses()) { + if (u.user != consumer && + !(u.user->matches("aten::size(Tensor self) -> int[]"))) { + return false; + } + } + } + return true; + } + + //判断给定的一个节点(node),当前engine是否支持。 + bool is_node_fusable(const Node* node) { + //针对aten::append需要额外的判断条件。 + //当aten::append位于某个block,而它可能改变其parentblock中的ListConstruct的产出的时候,整个逻辑就gg了。 + //本质上,这还是inplace 语义导致的问题。 + //poros需要针对 inplace语义的算子,做更好的预处理逻辑。 + // if ((node->kind() == aten::append) && engine_->is_node_supported(node) && + // node->inputs().at(0)->node()->kind() == prim::ListConstruct) { + // const Node* mutable_node = node->inputs().at(0)->node(); + // for (auto &use: mutable_node->output()->uses()) { + // if (use.user->owningBlock() != mutable_node->owningBlock()) { + // LOG(WARNING) << "opps! meet mutable aten::append: " << node_info(node); + // return false; + // } + // } + // return true; + // } + // loop和if子block里的mutable op还没有串联,fuse会有问题,先禁用 04.22 + if (PorosGlobalContext::instance().supported_mutable_ops_set.count(node->kind()) > 0 + && engine_->is_node_supported(node)) { + if (node->owningBlock() != node->owningGraph()->block()) { + LOG(WARNING) << "Graph fuser meets mutable op in sub_block, which is not" + " yet supported. Node info: " << node_info(node); + return false; + } + } + + if (node->kind() == kind_ || engine_->is_node_supported(node)) { + return true; + } + return false; + } + + //TODO: this should be better + //判断给定的一个节点(node),是否可以fuse到已有的节点组(fusion)里面去。 + bool is_node_fusable(const Node* fusion, const Node* node) { + //对prim::ListConstruct 这种有引用语义的,需要更严格的校验。 + // if (node->kind() == prim::ListConstruct && is_node_fusable(node)) { + // for (auto &use: node->output()->uses()) { + // if (use.user->owningBlock() != fusion->owningBlock()) { + // LOG(WARNING) << "opps! 
meet mutable ListConstruct: " << node_info(node); + // return false; + // } + // } + // return true; + // } + if (is_node_fusable(node)) { + return true; + } + return false; + } + + //返回给定节点的子图 + Graph& get_subgraph(Node* node) { + AT_ASSERT(node->kind() == kind_); + return *node->g(attr::Subgraph); + } + + // 合并两个graph + void merge_fusion_groups(Node* consumer_group, Node* producer_group) { + // Now we have two fusion groups! + // Revert the fusion - place all inner nodes of producer back in the outer + // graph. + std::vector temporary_nodes; + Graph* producer_subgraph = &get_subgraph(producer_group); + + // Initialize a map of inner graph values to outer graph values + std::unordered_map inner_to_outer; + at::ArrayRef inner_inputs = producer_subgraph->inputs(); + at::ArrayRef outer_inputs = producer_group->inputs(); + for (size_t i = 0; i < inner_inputs.size(); ++i) { + inner_to_outer[inner_inputs[i]] = outer_inputs[i]; + } + + // Clone all nodes + for (auto inner : producer_subgraph->nodes()) { + Node* outer = block_->owningGraph()->createClone( + inner, [&](Value* k) -> Value* { return inner_to_outer.at(k); }); + for (size_t i = 0 ; i < outer->inputs().size(); i++){ + outer->input(i)->setType(inner->input(i)->type()); + } + outer->insertBefore(producer_group); + temporary_nodes.emplace_back(outer); + at::ArrayRef inner_outputs = inner->outputs(); + at::ArrayRef outer_outputs = outer->outputs(); + for (size_t i = 0; i < inner_outputs.size(); ++i) { + inner_to_outer[inner_outputs[i]] = outer_outputs[i]; + } + } + + // Replace uses of producer_group outputs and destroy the producer + at::ArrayRef subgraph_outputs = producer_subgraph->outputs(); + for (size_t i = 0; i < subgraph_outputs.size(); ++i) { + Value* outer_output = inner_to_outer.at(subgraph_outputs[i]); + producer_group->outputs()[i]->replaceAllUsesWith(outer_output); + update_global_context(producer_group->outputs()[i], outer_output); + } + + // Inline the temporary nodes into the first group + Graph* consumer_subgraph = &get_subgraph(consumer_group); + for (auto it = temporary_nodes.rbegin(); it != temporary_nodes.rend(); ++it) { + Node* node = *it; + Node* merged = merge_node_into_group(consumer_group, node); + // If any of the outputs are still used then we need to add them + at::ArrayRef outputs = node->outputs(); + for (size_t i = 0; i < outputs.size(); ++i) { + Value* output = outputs[i]; + if (output->uses().size() == 0) { + continue; + } + consumer_subgraph->registerOutput(merged->outputs()[i]); + Value* new_output = consumer_group->addOutput(); + output->replaceAllUsesWith(new_output); + update_global_context(output, new_output); + new_output->setType(output->type()); + } + node->destroy(); + } + update_global_list_size_map_node_key_context(producer_group, consumer_group); + producer_group->destroy(); + producer_group = nullptr; // Just to get a clear error in case someone uses it + } + + Node* merge_node_into_group(Node* group, Node* to_merge_node) { + AT_ASSERT(to_merge_node->kind() != kind_); + Graph& subgraph = get_subgraph(group); + + // map from nodes in the surrounding graph to parameters in the fusion + // group's subgraph that correspond to them + std::unordered_map inputs_map; + size_t i = 0; + size_t tensor_insert_idx = 0; + AT_ASSERT(group->inputs().size() == subgraph.inputs().size()); + for (auto input : group->inputs()) { + inputs_map[input] = subgraph.inputs()[i++]; + if (input->type()->isSubtypeOf(TensorType::get())) { + tensor_insert_idx = i; //真的要单独搞个tensor_index_idx 么? 
+ } + } + + WithInsertPoint guard(*subgraph.nodes().begin()); + for (auto input : to_merge_node->inputs()) { + if (inputs_map.count(input) == 0) { + // TODO: we are following the convention for no good reason; + // we don't need tensor to come before any other inputs. + if (input->type()->isSubtypeOf(TensorType::get())) { + Value* in_group = subgraph.insertInput(tensor_insert_idx); + in_group->setType(input->type()); + inputs_map[input] = in_group; + group->insertInput(tensor_insert_idx, input); + tensor_insert_idx++; + } else if ( + // TODO: extend the supporting inputs here. + (input->type()->isSubtypeOf(FloatType::get()) && + input->node()->kind() != prim::Constant) || + (to_merge_node->kind() == aten::_grad_sum_to_size && + input->type()->isSubtypeOf(ListType::ofInts()))) { + Value* in_group = subgraph.addInput(); + in_group->setType(input->type()); + inputs_map[input] = in_group; + group->addInput(input); + } else if (input->node()->kind() == prim::Constant) { + // inline the constants directly in the body of the fused group. + Node* in_const = + subgraph.createClone(input->node(), [](Value*) -> Value* { + throw std::runtime_error("unexpected input"); + }); + subgraph.insertNode(in_const); + inputs_map[input] = in_const->output(); + } else { + Value* in_group = subgraph.addInput(); + in_group->setType(input->type()); + inputs_map[input] = in_group; + group->addInput(input); + } + } + } + + // for (auto input : to_merge_node->inputs()) { + // update_inputs_map(to_merge_node, input); + // } + + // copy n into the graph, remapping its inputs to internal nodes + Node* in_graph = subgraph.createClone( + to_merge_node, [&](Value* k) -> Value* { return inputs_map[k]; }, true); + + at::ArrayRef inputs = group->inputs(); + for (size_t i = 0; i < to_merge_node->outputs().size(); ++i) { + auto it = std::find(inputs.begin(), inputs.end(), to_merge_node->outputs()[i]); + if (it != inputs.end()) { + size_t p = it - inputs.begin(); + group->removeInput(p); + subgraph.inputs()[p]->replaceAllUsesWith(in_graph->outputs()[i]); + subgraph.eraseInput(p); + } + } + return subgraph.insertNode(in_graph); + } + + //将node转化为一个subgraph + Node* create_singleton_fusion_group(Node* n) { + Node* group = block_->owningGraph()->createWithSubgraph(kind_); + // propogate position information for the new node so we can always + // have a valid mapping + group->insertBefore(n); + Node* mergedNode = merge_node_into_group(group, n); + if (mergedNode->outputs().size() == 1) { + get_subgraph(group).registerOutput(mergedNode->output()); + Value* sel = group->addOutput(); + sel->copyMetadata(n->output()); + n->replaceAllUsesWith(group); + update_global_context(n->output(), sel); + //fix bug: handle situation when node has more than one output situation. 
+ } else { + for (size_t index = 0; index < mergedNode->outputs().size(); index++) { + get_subgraph(group).registerOutput(mergedNode->outputs().at(index)); + Value* new_value = group->insertOutput(index)->copyMetadata(n->outputs().at(index)); + n->outputs().at(index)->replaceAllUsesWith(new_value); + update_global_context(n->outputs().at(index), new_value); + } + } + update_global_list_size_map_node_key_context(n, group); + n->destroy(); + return group; + } + + at::optional try_fuse(Node* consumer, Value* producer) { + + LOG(INFO) << "[try_fuse] consumer: " << node_info(consumer); + LOG(INFO) << "[try_fuse] producer: " << node_info(producer->node()); + + bool shouldFuse = + //TODO: check carefully later + is_node_fusable(consumer, producer->node()) && + // Rearrange nodes such that all uses of producer are after the + // consumer. Fusion will rewrite those later uses to use the version of + // producer generated by the fused blob. In this case, producer becomes + // an output of the fusion group. + aliasDb_->moveBeforeTopologicallyValid(producer->node(), consumer); + + if (producer->node()->kind() == prim::Constant) { + shouldFuse = true; + } + + if (!shouldFuse) { + LOG(INFO) << "[try_fuse Fail] should not fuse"; + return at::nullopt; + } + + Node* group = consumer; + if (producer->node()->kind() == kind_) { + if (consumer->kind() != kind_) { + group = create_singleton_fusion_group(consumer); + // should not update here cause consumer has destroyed. + // update_global_list_size_map_node_key_context(consumer, group); + } + merge_fusion_groups(group, producer->node()); + LOG(INFO) << "[try_fuse Success] FusionGroup is: " << node_info(group); + return group; + } + + // TODO: pay attention here. we should check multi output situation carefully. + if (producer->node()->outputs().size() != 1 && + !all_users_are_this_cunsumer(consumer, producer)) { + LOG(INFO) << "[try_fuse Fail] Should not fuse, producer output sizes: " << producer->node()->outputs().size() + << ", and is all_users_are_this_cunsumer: " << all_users_are_this_cunsumer(consumer, producer); + return at::nullopt; + } + + if (consumer->kind() != kind_) { + group = create_singleton_fusion_group(consumer); + // should not update here cause consumer has destroyed. + // update_global_list_size_map_node_key_context(consumer, group); + } + + Node* merged = merge_node_into_group(group, producer->node()); + //support for constant input. cause we copy the input. no need to replace this. + //TODO: pay attention here. constant handle should be careful. 
+ if (producer->uses().size() != 0 && + producer->node()->kind() != prim::Constant) { + get_subgraph(group).registerOutput(merged->output()); + Value* new_producer = group->addOutput(); + new_producer->copyMetadata(producer); + producer->replaceAllUsesWith(new_producer); + update_global_context(producer, new_producer); + } + update_global_list_size_map_node_key_context(producer->node(), group); + if (producer->node()->kind() != prim::Constant) { + producer->node()->destroy(); + } + LOG(INFO) << "[try_fuse Success] FusionGroup is: " << node_info(group); + return group; + } + + value_list sort_reverse_topological(ArrayRef inputs) { + value_list result; + for (auto i : inputs) { + if ((i->node()->owningBlock() == block_) || + (i->node()->kind() == prim::Constant)) { + result.push_back(i); + } + } + // Sort in reverse topological order + std::sort(result.begin(), result.end(), [&](Value* a, Value* b) { + return a->node()->isAfter(b->node()); + }); + return result; + } + + // returns where to continue scanning, and whether any fusion was made + // todo 换条件 + std::pair scan_node(Node* consumer, const std::string list_construct) { + if (is_node_fusable(consumer)) { + value_list inputs = sort_reverse_topological(consumer->inputs()); + for (Value* producer : inputs) { + if ((list_construct == "input" && (producer->node()->kind() != prim::ListConstruct || consumer->kind() != prim::CudaFusionGroup)) || + (list_construct == "output" && (consumer->kind() != prim::ListUnpack || producer->node()->kind() != prim::CudaFusionGroup)) || + (producer->node()->kind() == prim::ListUnpack) || (consumer->kind() == prim::ListConstruct)) { + continue; + } + + at::optional fusion_group = try_fuse(consumer, producer); + if (fusion_group) { + // after fusion, consumer moves into a FusionGroup, so inputs is no + // longer valid so we rescan the new FusionGroup for more fusions... + return std::make_pair(fusion_group.value()->reverseIterator(), true); + } + } + } + return std::make_pair(++consumer->reverseIterator(), false); + } + + void refresh_aliasdb() { + aliasDb_ = torch::make_unique(graph_); + } + + void optimize_fused_graphs() { + for (Node* node : block_->nodes()) { + if (node->kind() != kind_) { + continue; + } + auto subgraph = node->g(attr::Subgraph); + EliminateDeadCode(subgraph); + EliminateCommonSubexpression(subgraph); + ConstantPooling(subgraph); + } + } + + void run(const std::string list_construct="") { + bool any_changed = true; + while (any_changed) { + any_changed = false; + refresh_aliasdb(); + for (auto it = block_->nodes().rbegin(); it != block_->nodes().rend();) { + bool changed = false; + std::tie(it, changed) = scan_node(*it, list_construct); + any_changed |= changed; + } + } + refresh_aliasdb(); + optimize_fused_graphs(); + + //TODO: should I add this??? 
+ // for (Node* n : block_->nodes()) { + // removeOutputsUsedOnlyInSize(n); + // } + + for (Node* node : block_->nodes()) { + for (Block* sub_block : node->blocks()) { + PorosGraphSegment(sub_block, graph_, engine_).run(list_construct); + } + } + } +}; // struct PorosGraphSegment + +void gen_value_dyanamic_shape_of_tensorlist(torch::jit::Value* tensor_value, + size_t idx, + std::map> type_map) { + auto &_value_dynamic_shape_map = PorosGlobalContext::instance()._value_dynamic_shape_map; + + // 这里在的tensor_value地址可能和预热时的profile value地址一样,所以直接覆盖 + ValueDynamicShape dynamic_shape; + _value_dynamic_shape_map[tensor_value] = dynamic_shape; + _value_dynamic_shape_map[tensor_value].is_dynamic = false; + _value_dynamic_shape_map[tensor_value].max_shapes = type_map[idx][0]->sizes().concrete_sizes().value(); + _value_dynamic_shape_map[tensor_value].min_shapes = type_map[idx][0]->sizes().concrete_sizes().value(); + _value_dynamic_shape_map[tensor_value].opt_shapes = type_map[idx][0]->sizes().concrete_sizes().value(); + + // max + for (size_t i = 0; i < type_map[idx].size(); i++){ + std::vector tmp_max_shape = _value_dynamic_shape_map[tensor_value].max_shapes; + for(size_t j = 0; j < tmp_max_shape.size(); j++){ + _value_dynamic_shape_map[tensor_value].max_shapes[j] = std::max(tmp_max_shape[j], type_map[idx][i]->sizes()[j].value()); + } + } + + // min + for (size_t i = 0; i < type_map[idx].size(); i++){ + std::vector tmp_min_shape = _value_dynamic_shape_map[tensor_value].min_shapes; + for(size_t j = 0; j < tmp_min_shape.size(); j++){ + _value_dynamic_shape_map[tensor_value].min_shapes[j] = std::min(tmp_min_shape[j], type_map[idx][i]->sizes()[j].value()); + } + } + + ValueDynamicShape& shape = _value_dynamic_shape_map[tensor_value]; + for (size_t i = 0; i < shape.max_shapes.size(); ++i) { + if (shape.max_shapes[i] == shape.min_shapes[i] && shape.max_shapes[i] == shape.opt_shapes[i]) { + shape.sizes.push_back(shape.max_shapes[i]); + } else { + shape.sizes.push_back(-1); + shape.is_dynamic = true; + } + } +} + +// 此处为子图预判断 +// 作用:由于AdjustmentListTensorOutput、AdjustmentListTensorInput、AdjustmentScalarInput处理子图时会额外增加一些节点, +// 对于一些可预知的必然回退的子图(例如:unconst node不足、不支持的输出类型等),我们就不去处理这个子图,以减少不必要的节点数增加。 +bool cudafusion_should_be_handle(torch::jit::Node* node) { + // 如果传入是其他节点,直接返回false + if (node->kind() != torch::jit::prim::CudaFusionGroup) { + return false; + } + + int non_constant_node_num = 0; + std::shared_ptr subgraph = node->g(torch::jit::attr::Subgraph); + Block* subblock = subgraph->block(); + // 子图太小不处理 + int32_t unconst_threshold = PorosGlobalContext::instance().get_poros_options().unconst_ops_thres; + for (auto it = subblock->nodes().begin(); it != subblock->nodes().end(); ++it) { + if (it->kind() != torch::jit::prim::Constant) { + non_constant_node_num++; + if (non_constant_node_num > unconst_threshold) { + break; + } + } + } + if (non_constant_node_num <= unconst_threshold) { + LOG(WARNING) << "Subgraph: " << node_info_with_attr(node) << " size is too small, No tactics will be applied to it."; + return false; + } + return true; +} + +void AddPackAndUnpack(std::shared_ptr& group_graph, torch::jit::Value* value, size_t idx, torch::jit::Node* node, bool input=true){ + /* 在CudaFusionGroup内(ouput)/外(input),添加一个prim::ListUnpack;List[Tensor] -> Tensor、Tensor ... + 在CudaFusionGroup外(ouput)/内(input),添加一个prim::ListConstruct;Tensor、Tensor ... 
-> List[Tensor]*/ + + // 根据input or output 选择不同的map, 如果input中存在该list,则优先使用input;这样避免了append给output带来的引用问题。 + LIST_SIZE_MAP list_size_map = {}; //PorosGlobalContext::instance()._list_size_map._list_size_map_input; + TENSOR_LIST_TYPE_MAP list_tensor_type_map = {}; //PorosGlobalContext::instance()._list_size_map._list_tensor_type_map_input; + + if (input || PorosGlobalContext::instance()._list_size_map._list_size_map_input.count(value) != 0) { + list_size_map = PorosGlobalContext::instance()._list_size_map._list_size_map_input; + list_tensor_type_map = PorosGlobalContext::instance()._list_size_map._list_tensor_type_map_input; + if (!input) { + Node* input_node = list_size_map[value].begin()->first; + list_size_map[value][node] = list_size_map[value][input_node]; + list_tensor_type_map[value][node] = list_tensor_type_map[value][input_node]; + } + } + else { + list_size_map = PorosGlobalContext::instance()._list_size_map._list_size_map_output; + list_tensor_type_map = PorosGlobalContext::instance()._list_size_map._list_tensor_type_map_output; + } + + // 获取该tensorlist的长度 + int list_size = 0; + if (list_size_map.count(value) > 0) { + if (list_size_map[value].count(node) > 0) { + if (list_size_map[value][node].size() != 1) { + LOG(INFO) << "list " + value->debugName() << " has " << std::to_string(list_size_map[value].size()) << " lengths"; + return; + } + list_size = *list_size_map[value][node].begin(); + } + else { + LOG(INFO) << "node is not in list_size_map, value: %" << value->debugName() << ", node info:" << node_info(node); + throw c10::Error("node must be in list_size_map", ""); + } + } + else { + LOG(INFO) << "value is not in list_size_map, value: %" << value->debugName(); + throw c10::Error("value must be in list_size_map", ""); + } + if (list_size == 0) { + LOG(INFO) << "The length of the output list is 0: " << node_info(node); + return; + } + + // 新建一个unpack_node 和 pack_node + Node* unpack_node = group_graph->create(prim::ListUnpack, value); + Node* pack_node = group_graph->create(prim::ListConstruct, unpack_node->outputs()); + pack_node->output(0)->setType(value->type()); + std::vector guard_types; + + // 更新下,给后面的前置判断使用 + list_size_map[value][unpack_node] = {list_size}; + + if(input) { + pack_node->insertBefore(node); + unpack_node->insertBefore(pack_node); + } + else { + unpack_node->insertAfter(node); + pack_node->insertAfter(unpack_node); + } + + // 更新相关输入输出 + bool is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic; + std::map> type_map = list_tensor_type_map[value][node]; + pack_node->replaceInput(0, unpack_node->output(0)); + unpack_node->replaceInput(0, value); + unpack_node->output(0)->setType(type_map[0][0]); + pack_node->input(0)->setType(type_map[0][0]); + guard_types.push_back(type_map[0][0]); + + if (!input) { + value->replaceAllUsesWith(pack_node->output(0)); + update_global_context(value, pack_node->output(0)); + unpack_node->replaceInput(0, value); + } + if (is_dynamic_shape && input) { + gen_value_dyanamic_shape_of_tensorlist(unpack_node->output(0), 0, type_map); + } + + for (int j = 0; j < list_size - 1; j++){ + unpack_node->insertOutput(j + 1); + pack_node->insertInput(j + 1, unpack_node->output(j + 1)); + unpack_node->output(j + 1)->setType(type_map[j + 1][0]); + pack_node->input(j + 1)->setType(type_map[j + 1][0]); + guard_types.push_back(type_map[j + 1][0]); + if (is_dynamic_shape && input) { + gen_value_dyanamic_shape_of_tensorlist(unpack_node->output(j + 1), j + 1, type_map); + } + } + + if (input) { + node->replaceInput(idx, 
+
+void AdjustmentListTensorInput(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block) {
+    /* rewrite tensor-list inputs into several plain tensor inputs, so that they match the
+       input types TensorRT supports */
+    graph_node_list nodes = block->nodes();
+    for(auto it = nodes.begin(); it != nodes.end(); it++){
+        for (Block* subblock : it->blocks()) {
+            AdjustmentListTensorInput(group_graph, subblock);
+        }
+        if (it->kind() == prim::CudaFusionGroup) {
+            if (!cudafusion_should_be_handle(*it)) {
+                continue;
+            }
+            at::ArrayRef<torch::jit::Value*> inputs = it->inputs();
+            for (size_t i = 0; i < inputs.size(); i++){
+                if(inputs[i]->type()->str() == "Tensor[]") {
+                    LOG(INFO) << "Adjustment Tensor[] input %" << inputs[i]->debugName();
+                    AddPackAndUnpack(group_graph, inputs[i], i, *it, true);
+                }
+            }
+        }
+    }
+}
+
+void AdjustmentListTensorOutput(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block) {
+    /* rewrite tensor-list outputs into several plain tensor outputs, so that they match the
+       output types TensorRT supports */
+    graph_node_list nodes = block->nodes();
+    for(auto it = nodes.begin(); it != nodes.end(); it++){
+        for (Block* subblock : it->blocks()) {
+            AdjustmentListTensorOutput(group_graph, subblock);
+        }
+        if (it->kind() == prim::CudaFusionGroup) {
+            if (!cudafusion_should_be_handle(*it)) {
+                continue;
+            }
+            at::ArrayRef<torch::jit::Value*> outputs = it->outputs();
+            for (size_t i = 0; i < outputs.size(); i++){
+                if(outputs[i]->type()->str() == "Tensor[]") {
+                    LOG(INFO) << "Adjustment Tensor[] output %" << outputs[i]->debugName();
+                    AddPackAndUnpack(group_graph, outputs[i], i, *it, false);
+                }
+            }
+        }
+    }
+}
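+
+// Note on the "Tensor[]" checks above: they compare against the printed form of the JIT type.
+// An equivalent check without the string comparison could look like this (sketch only,
+// value_is_tensor_list is a name invented here, not part of Poros):
+static inline bool value_is_tensor_list(const torch::jit::Value* v) {
+    return v->type()->isSubtypeOf(c10::ListType::ofTensors());
+}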
+
+// When cudafusiongroup subgraph input is int (or int[]) like:
+// %1 : int = size(%x, %b)
+// %4 : Tensor = prim::CudaFusionGroup(%1)
+// or
+// %1 : int[] = size(%x)
+// %4 : Tensor = prim::CudaFusionGroup(%1)
+//
+// Then we insert aten::tensor and aten::IntImplicit (or prim::tolist) before the subgraph like:
+// %1 : int = size(%x, %b)
+// %2 : Tensor = aten::tensor(%1, %type, %device, %requires_grad)
+// %3 : int = aten::IntImplicit(%2)
+// %4 : Tensor = prim::CudaFusionGroup(%3)
+// or
+// %1 : int[] = size(%x)
+// %2 : Tensor = aten::tensor(%1, %type, %device, %requires_grad)
+// %3 : int = prim::tolist(%2, %dim, %type)
+// %4 : Tensor = prim::CudaFusionGroup(%3)
+//
+// Finally, merge the aten::IntImplicit (or prim::tolist) into the cudafusiongroup subgraph. The int input has been replaced by tensor.
+bool AddInputTensorandScalarimplict(std::shared_ptr<torch::jit::Graph>& group_graph, torch::jit::Value* value, size_t idx, torch::jit::Node* node, IEngine* engine) {
+    bool value_type_is_list = (value->type()->kind() == c10::TypeKind::ListType);
+    int32_t list_size = 1;
+    LIST_SIZE_MAP list_size_map = {};
+    if (value_type_is_list) {
+        // get list size
+        list_size_map = PorosGlobalContext::instance()._list_size_map._list_size_map_input;
+        if (list_size_map.count(value) > 0) {
+            if (list_size_map[value].count(node) > 0) {
+                if (list_size_map[value][node].size() != 1) {
+                    LOG(WARNING) << "list " + value->debugName() << " has " << std::to_string(list_size_map[value].size()) << " lengths";
+                    return false;
+                }
+                list_size = *list_size_map[value][node].begin();
+            } else {
+                LOG(WARNING) << "node is not in list_size_map, value: %" << value->debugName() << ", node info:" << node_info(node);
+                return false;
+            }
+        } else {
+            LOG(WARNING) << "value is not in list_size_map, value: %" << value->debugName();
+            return false;
+        }
+    }
+    // check whether the global _int_intlist_values_map holds the current scalar value
+    std::map<torch::jit::Value*, ValueDynamicShape>& int_intlist_values_map = PorosGlobalContext::instance()._int_intlist_values_map;
+    if (value->type()->isSubtypeOf(c10::ListType::ofInts()) || value->type()->kind() == c10::TypeKind::IntType) {
+        if (int_intlist_values_map.count(value) == 0) {
+            LOG(WARNING) << "can't find max min opt of int(or int[]) %" << value->debugName();
+            return false;
+        }
+    }
+
+    std::map<torch::jit::Value*, ValueDynamicShape>& value_dynamic_shape_map = PorosGlobalContext::instance()._value_dynamic_shape_map;
+    auto fuser = PorosGraphSegment(group_graph->block(), group_graph, engine);
+    // create aten::tensor
+    torch::jit::Node* tensor_node = group_graph->create(torch::jit::aten::tensor);
+    tensor_node->insertBefore(node);
+    tensor_node->addInput(value);
+    // note: without setInsertPoint, insertConstant appends new nodes at the end of the graph.
+    // It is better not to use setInsertPoint here: if the graph changes and the insert point
+    // moves with it, the process can core dump. Prefer "insertConstant followed by moveBefore"
+    // over "setInsertPoint followed by insertConstant".
+    // group_graph->setInsertPoint(tensor_node);
+    // create the dtype, device and requires_grad constant inputs of aten::tensor
+    torch::jit::Value* type_value = nullptr;
+    c10::optional<at::ScalarType> output_scalar_type;
+    if (value_type_is_list) {
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            type_value = group_graph->insertConstant(c10::ScalarType::Long);
+            output_scalar_type = at::kLong;
+        } else {
+            type_value = group_graph->insertConstant(c10::ScalarType::Float);
+            output_scalar_type = at::kFloat;
+        }
+    } else {
+        if (value->type()->kind() == c10::TypeKind::IntType) {
+            type_value = group_graph->insertConstant(c10::ScalarType::Int);
+            output_scalar_type = at::kInt;
+        } else {
+            type_value = group_graph->insertConstant(c10::ScalarType::Float);
+            output_scalar_type = at::kFloat;
+        }
+    }
+    torch::jit::Value* device_value = nullptr;
+    c10::optional<torch::Device> output_device;
+    if (PorosGlobalContext::instance().get_poros_options().device == Device::GPU) {
+        device_value = group_graph->insertConstant(torch::Device(torch::DeviceType::CUDA, 0));
+        output_device = torch::Device(at::kCUDA, 0);
+    } else {
+        torch::jit::IValue none_ivalue;
+        device_value = group_graph->insertConstant(none_ivalue);
+        output_device = torch::Device(at::kCPU);
+    }
+    torch::jit::Value* false_value = group_graph->insertConstant(false);
+    // with no insert point set, insertConstant appended the constants at the end of the graph,
+    // so move them in front of tensor_node
+    type_value->node()->moveBefore(tensor_node);
+    device_value->node()->moveBefore(tensor_node);
+    false_value->node()->moveBefore(tensor_node);
+
+    tensor_node->addInput(type_value);
+    tensor_node->addInput(device_value);
+    tensor_node->addInput(false_value);
+    // must set output type
+    TypePtr output_type = c10::TensorType::create(output_scalar_type,
+                                                  output_device,
+                                                  c10::SymbolicShape(std::vector<c10::optional<int64_t>>({list_size})),
+                                                  std::vector<c10::Stride>({c10::Stride{0, true, 1}}),
+                                                  false);
+    tensor_node->output(0)->setType(output_type);
+    // Update the max/min/opt recorded for the aten::tensor output in value_dynamic_shape_map
+    // with the values recorded for `value` in int_intlist_values_map, because
+    // tensor_node->output(0) is about to become a subgraph input.
+    value_dynamic_shape_map[tensor_node->output(0)] = int_intlist_values_map[value];
+
+    // create the scalar implicit node
+    // case 1: the value is a scalar list
+    if (value_type_is_list) {
+        // record the list_size of `value` at the aten::tensor node in list_size_map
+        list_size_map[value][tensor_node] = {(int32_t)list_size};
+        // int list: create a prim::tolist node
+        torch::jit::Node* tolist_node = group_graph->create(torch::jit::prim::tolist);
+        tolist_node->insertBefore(node);
+        torch::jit::Value* dim_val = group_graph->insertConstant(int(1));
+        torch::jit::Value* type_val = nullptr;
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            // int list
+            type_val = group_graph->insertConstant(int(0));
+        } else {
+            // float list
+            type_val = group_graph->insertConstant(int(1));
+        }
+        tolist_node->addInput(tensor_node->output(0));
+
+        dim_val->node()->moveBefore(tolist_node);
+        type_val->node()->moveBefore(tolist_node);
+
+        tolist_node->addInput(dim_val);
+        tolist_node->addInput(type_val);
+
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            tolist_node->output(0)->setType(c10::ListType::ofInts());
+        } else {
+            tolist_node->output(0)->setType(c10::ListType::ofFloats());
+        }
+        node->replaceInput(idx, tolist_node->output(0));
+
+        // update the maps by hand
+        list_size_map[tolist_node->output(0)][tolist_node] = {(int32_t)list_size};
+        list_size_map[tolist_node->output(0)][node] = {(int32_t)list_size};
+        int_intlist_values_map[tolist_node->output(0)] = int_intlist_values_map[value];
+
+        // merge the tolist node into the subgraph
+        fuser.refresh_aliasdb();
+        fuser.merge_node_into_group(node, type_val->node());
+        fuser.merge_node_into_group(node, dim_val->node());
+        fuser.merge_node_into_group(node, tolist_node);
+        fuser.refresh_aliasdb();
+        fuser.optimize_fused_graphs();
+
+    // case 2: the input is a plain scalar
+    } else {
+        // int: create IntImplicit
+        torch::jit::Node* scalar_implicit_node = nullptr;
+        if (value->type()->kind() == c10::TypeKind::IntType) {
+            torch::jit::Node* intimplicit_node = group_graph->create(torch::jit::aten::IntImplicit, tensor_node->output(0));
+            intimplicit_node->output(0)->setType(c10::IntType::get());
+            intimplicit_node->insertBefore(node);
+            node->replaceInput(idx, intimplicit_node->output(0));
+            scalar_implicit_node = intimplicit_node;
+        } else {
+            // float: create FloatImplicit
+            torch::jit::Node* floatimplicit_node = group_graph->create(torch::jit::aten::FloatImplicit, tensor_node->output(0));
+            floatimplicit_node->output(0)->setType(c10::FloatType::get());
+            floatimplicit_node->insertBefore(node);
+            node->replaceInput(idx, floatimplicit_node->output(0));
+            scalar_implicit_node = floatimplicit_node;
+        }
+        // update int_intlist_values_map
+        int_intlist_values_map[scalar_implicit_node->output(0)] = int_intlist_values_map[value];
+        fuser.refresh_aliasdb();
+        fuser.try_fuse(node, node->input(idx));
+        fuser.refresh_aliasdb();
+        fuser.optimize_fused_graphs();
+    }
+    return true;
+}
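+
+// The recurring "insertConstant, then moveBefore" steps above could be factored into a helper
+// like the following (sketch only, insert_constant_before is a name invented here):
+template <typename T>
+static torch::jit::Value* insert_constant_before(std::shared_ptr<torch::jit::Graph>& graph,
+                                                 T&& constant,
+                                                 torch::jit::Node* anchor) {
+    // without an insert point, insertConstant appends the constant at the end of the graph ...
+    torch::jit::Value* v = graph->insertConstant(std::forward<T>(constant));
+    // ... so move the freshly created node in front of the node that consumes it
+    v->node()->moveBefore(anchor);
+    return v;
+}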
+
+// When a subgraph output is a scalar (or a scalar list),
+// create aten::tensor plus aten::IntImplicit (or prim::tolist),
+// then fuse the aten::tensor node into the subgraph.
+bool AddOutputTensorandScalarimplict(std::shared_ptr<torch::jit::Graph>& group_graph, torch::jit::Value* value, size_t idx, torch::jit::Node*& node, IEngine* engine) {
+    bool value_type_is_list = (value->type()->kind() == c10::TypeKind::ListType);
+    size_t list_size = 1;
+    LIST_SIZE_MAP list_size_map = {};
+    if (value_type_is_list) {
+        // get list size
+        if (PorosGlobalContext::instance()._list_size_map._list_size_map_input.count(value) != 0) {
+            list_size_map = PorosGlobalContext::instance()._list_size_map._list_size_map_input;
+            Node* input_node = list_size_map[value].begin()->first;
+            list_size_map[value][node] = list_size_map[value][input_node];
+        }
+        else {
+            list_size_map = PorosGlobalContext::instance()._list_size_map._list_size_map_output;
+        }
+        if (list_size_map.count(value) > 0) {
+            if (list_size_map[value].count(node) > 0) {
+                if (list_size_map[value][node].size() != 1) {
+                    LOG(WARNING) << "list " + value->debugName() << " has " << std::to_string(list_size_map[value].size()) << " lengths";
+                    return false;
+                }
+                list_size = *list_size_map[value][node].begin();
+            } else {
+                LOG(WARNING) << "node is not in list_size_map, value: %" << value->debugName() << ", node info:" << node_info(node);
+                return false;
+            }
+        } else {
+            LOG(WARNING) << "value is not in list_size_map, value: %" << value->debugName();
+            return false;
+        }
+    }
+    // check whether the global _int_intlist_values_map holds the current scalar value
+    std::map<torch::jit::Value*, ValueDynamicShape>& int_intlist_values_map = PorosGlobalContext::instance()._int_intlist_values_map;
+    if (value->type()->isSubtypeOf(c10::ListType::ofInts()) || value->type()->kind() == c10::TypeKind::IntType) {
+        if (int_intlist_values_map.count(value) == 0) {
+            LOG(WARNING) << "can't find max min opt of int(or int[]) %" << value->debugName();
+            return false;
+        }
+    }
+
+    std::map<torch::jit::Value*, ValueDynamicShape>& value_dynamic_shape_map = PorosGlobalContext::instance()._value_dynamic_shape_map;
+    auto fuser = PorosGraphSegment(group_graph->block(), group_graph, engine);
+    // create aten::tensor
+    torch::jit::Node* tensor_node = group_graph->create(torch::jit::aten::tensor);
+    tensor_node->insertAfter(node);
+    tensor_node->addInput(value);
+    // create the dtype, device and requires_grad constant inputs of aten::tensor
+    torch::jit::Value* type_value = nullptr;
+    c10::optional<at::ScalarType> output_scalar_type;
+    if (value_type_is_list) {
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            type_value = group_graph->insertConstant(c10::ScalarType::Long);
+            output_scalar_type = at::kLong;
+        } else {
+            type_value = group_graph->insertConstant(c10::ScalarType::Float);
+            output_scalar_type = at::kFloat;
+        }
+    } else {
+        if (value->type()->kind() == c10::TypeKind::IntType) {
+            type_value = group_graph->insertConstant(c10::ScalarType::Int);
+            output_scalar_type = at::kInt;
+        } else {
+            type_value = group_graph->insertConstant(c10::ScalarType::Float);
+            output_scalar_type = at::kFloat;
+        }
+    }
+    torch::jit::Value* device_value = nullptr;
+    c10::optional<torch::Device> output_device;
+    if (PorosGlobalContext::instance().get_poros_options().device == Device::GPU) {
+        device_value = group_graph->insertConstant(torch::Device(torch::DeviceType::CUDA, 0));
+        output_device = torch::Device(at::kCUDA, 0);
+    } else {
+        torch::jit::IValue none_ivalue;
+        device_value = group_graph->insertConstant(none_ivalue);
+        output_device = torch::Device(at::kCPU);
+    }
+    torch::jit::Value* false_value = group_graph->insertConstant(false);
+
+    type_value->node()->moveBefore(tensor_node);
+    device_value->node()->moveBefore(tensor_node);
+    false_value->node()->moveBefore(tensor_node);
+
+    tensor_node->addInput(type_value);
+    tensor_node->addInput(device_value);
+    tensor_node->addInput(false_value);
+
+    // must set output type
+    TypePtr output_type = c10::TensorType::create(output_scalar_type,
+                                                  output_device,
+                                                  c10::SymbolicShape(std::vector<c10::optional<int64_t>>({list_size})),
+                                                  std::vector<c10::Stride>({c10::Stride{0, true, 1}}),
+                                                  false);
+    tensor_node->output(0)->setType(output_type);
+
+    value_dynamic_shape_map[tensor_node->output(0)] = int_intlist_values_map[value];
+
+    // create the scalar implicit node
+    // case 1: the output is a scalar list
+    torch::jit::Node* tolist_node = nullptr;
+    torch::jit::Node* scalar_implicit_node = nullptr;
+    if (value_type_is_list) {
+        // record the list_size of `value` at the aten::tensor node in list_size_map
+        list_size_map[value][tensor_node] = {(int32_t)list_size};
+        // int list: create a prim::tolist node
+        tolist_node = group_graph->create(torch::jit::prim::tolist);
+        tolist_node->insertAfter(tensor_node);
+        tolist_node->addInput(tensor_node->output(0));
+        torch::jit::Value* dim_val = group_graph->insertConstant(int(1));
+        torch::jit::Value* type_val = nullptr;
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            // int list
+            type_val = group_graph->insertConstant(int(0));
+        } else {
+            // float list
+            type_val = group_graph->insertConstant(int(1));
+        }
+
+        dim_val->node()->moveBefore(tolist_node);
+        type_val->node()->moveBefore(tolist_node);
+
+        tolist_node->addInput(dim_val);
+        tolist_node->addInput(type_val);
+
+        if (value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            tolist_node->output(0)->setType(c10::ListType::ofInts());
+        } else {
+            tolist_node->output(0)->setType(c10::ListType::ofFloats());
+        }
+        value->replaceAllUsesAfterNodeWith(tolist_node, tolist_node->output(0));
+
+        list_size_map[tolist_node->output(0)][tolist_node] = {(int32_t)list_size};
+        // entries in list_size_map are keyed per node, so they have to be updated one by one
+        torch::jit::use_list tolist_node_user = tolist_node->output(0)->uses();
+        for (size_t u = 0; u < tolist_node_user.size(); u++) {
+            list_size_map[tolist_node->output(0)][tolist_node_user[u].user] = {(int32_t)list_size};
+        }
+        int_intlist_values_map[tolist_node->output(0)] = int_intlist_values_map[value];
+    } else {
+        // case 2: a plain int, create an IntImplicit node
+        if (value->type()->kind() == c10::TypeKind::IntType) {
+            torch::jit::Node* intimplicit_node = group_graph->create(torch::jit::aten::IntImplicit, tensor_node->output(0));
+            intimplicit_node->output(0)->setType(c10::IntType::get());
+            intimplicit_node->insertAfter(tensor_node);
+            scalar_implicit_node = intimplicit_node;
+        } else {
+            // a plain float, create a FloatImplicit node
+            torch::jit::Node* floatimplicit_node = group_graph->create(torch::jit::aten::FloatImplicit, tensor_node->output(0));
+            floatimplicit_node->output(0)->setType(c10::FloatType::get());
+            floatimplicit_node->insertAfter(tensor_node);
+            scalar_implicit_node = floatimplicit_node;
+        }
+        value->replaceAllUsesAfterNodeWith(scalar_implicit_node, scalar_implicit_node->output(0));
+        // update int_intlist_values_map
+        int_intlist_values_map[scalar_implicit_node->output(0)] = int_intlist_values_map[value];
+    }
+    // build a subgraph around aten::tensor, update the global maps, and finally fuse it with `node`
+    fuser.refresh_aliasdb();
+    torch::jit::Node* subgraph_node = fuser.create_singleton_fusion_group(tensor_node);
+    fuser.merge_node_into_group(subgraph_node, type_value->node());
+    fuser.merge_node_into_group(subgraph_node, device_value->node());
+    fuser.merge_node_into_group(subgraph_node, false_value->node());
+    // list_size_map must be updated whenever the node it is keyed on changes
+    if (value_type_is_list) {
+        list_size_map[value][subgraph_node] = {(int32_t)list_size};
+    }
+    value_dynamic_shape_map[subgraph_node->output(0)] = int_intlist_values_map[value];
+    fuser.try_fuse(subgraph_node, subgraph_node->input(0));
+    // the previous subgraph of `node` is gone after fusing via the subgraph built around
+    // aten::tensor, so update `node` to the newly fused subgraph
+    node = subgraph_node;
+    fuser.refresh_aliasdb();
+    fuser.optimize_fused_graphs();
+    return true;
+}
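+
+// The resulting IR for a plain scalar output looks roughly like this (hypothetical fragment):
+//   before:  %n : int = prim::CudaFusionGroup_0(...)
+//            ... uses of %n ...
+//   after:   %t : Tensor = prim::CudaFusionGroup_0(...)   // aten::tensor fused inside
+//            %n2 : int = aten::IntImplicit(%t)
+//            ... the uses now consume %n2 ...
+// i.e. the subgraph returns a one-element tensor and the implicit cast back to a scalar
+// stays outside of it.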
+
+// rewrite the scalar inputs of a subgraph into tensor inputs
+bool adjust_scalar_input(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block, IEngine* engine) {
+    bool changed = false;
+    graph_node_list nodes = block->nodes();
+    for(auto it = nodes.begin(); it != nodes.end(); ) {
+        Node* current_node = *it;
+        it++;
+        for (Block* subblock : current_node->blocks()) {
+            changed |= adjust_scalar_input(group_graph, subblock, engine);
+        }
+        if (current_node->kind() == prim::CudaFusionGroup) {
+            if (!cudafusion_should_be_handle(current_node)) {
+                continue;
+            }
+            at::ArrayRef<torch::jit::Value*> subgraph_node_inputs = current_node->inputs();
+            for (size_t i = 0; i < subgraph_node_inputs.size(); i++) {
+                // todo: support float and float[]
+                // mark by tsq 0713: scalar inputs inside a loop may be problematic, because the
+                // recorded max/min/opt are not necessarily the real ones; no such case has been
+                // hit so far.
+                if(subgraph_node_inputs[i]->type()->str() == "int" || subgraph_node_inputs[i]->type()->str() == "int[]") {
+                    LOG(INFO) << "Adjustment subgraph: " << node_info_with_attr(current_node) << " scalar input %" << subgraph_node_inputs[i]->debugName();
+                    std::string origin_input_debugname = subgraph_node_inputs[i]->debugName();
+                    if (AddInputTensorandScalarimplict(group_graph, subgraph_node_inputs[i], i, current_node, engine)) {
+                        LOG(INFO) << "Adjustment scalar input %" << origin_input_debugname << " to tensor %"
+                                << subgraph_node_inputs[i]->debugName() << " succeed!";
+                        changed = true;
+                    } else {
+                        LOG(WARNING) << "Adjustment scalar input %" << origin_input_debugname << " failed!";
+                    }
+                }
+            }
+        }
+    }
+    return changed;
+}
+
+void AdjustmentScalarInput(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block, IEngine* engine) {
+    bool changed = false;
+    changed = adjust_scalar_input(group_graph, block, engine);
+    if (changed) {
+        EliminateDeadCode(group_graph);
+        EliminateCommonSubexpression(group_graph);
+        ConstantPooling(group_graph);
+    }
+}
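+
+// The traversal above advances the iterator before mutating the graph, because
+// AddInputTensorandScalarimplict inserts nodes around current_node. A minimal model of the
+// pattern (sketch only):
+//   for (auto it = nodes.begin(); it != nodes.end(); ) {
+//       Node* current = *it;
+//       ++it;                       // step first, so `it` stays valid ...
+//       rewrite_around(current);    // ... while nodes are inserted around `current`
+//   }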
+
+// rewrite the scalar outputs of a subgraph into tensor outputs
+bool adjust_scalar_output(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block, IEngine* engine) {
+    bool changed = false;
+    graph_node_list nodes = block->nodes();
+    for(auto it = nodes.begin(); it != nodes.end(); ) {
+        Node* current_node = *it;
+        it++;
+        for (Block* subblock : current_node->blocks()) {
+            changed |= adjust_scalar_output(group_graph, subblock, engine);
+        }
+        if (current_node->kind() == prim::CudaFusionGroup) {
+            if (!cudafusion_should_be_handle(current_node)) {
+                continue;
+            }
+
+            for (size_t i = 0; i < current_node->outputs().size(); i++) {
+                if (current_node->output(i)->type()->str() == "int" || current_node->output(i)->type()->str() == "int[]") {
+                    // todo: support float and float[]
+                    LOG(INFO) << "Adjustment subgraph: " << node_info_with_attr(current_node) << " scalar output %" << current_node->output(i)->debugName();
+                    std::string origin_output_debugname = current_node->output(i)->debugName();
+                    if (AddOutputTensorandScalarimplict(group_graph, current_node->output(i), i, current_node, engine)) {
+                        LOG(INFO) << "Adjustment scalar output %" << origin_output_debugname << " to tensor %"
+                                << current_node->output(i)->debugName() << " succeed!";
+                        changed = true;
+                        // the subgraph is rebuilt after a scalar output is rewritten; keep scanning
+                        // the new subgraph for scalar outputs until none is left
+                        i = 0;
+                    } else {
+                        LOG(WARNING) << "Adjustment scalar output %" << origin_output_debugname << " failed!";
+                    }
+                }
+            }
+        }
+    }
+    return changed;
+}
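+
+// The `i = 0` reset above effectively turns the scan into a fixed-point loop over the freshly
+// created fusion group. An equivalent formulation (sketch only, is_scalar_typed is invented here):
+//   bool rewrote = true;
+//   while (rewrote) {
+//       rewrote = false;
+//       for (size_t i = 0; i < current_node->outputs().size(); ++i) {
+//           if (is_scalar_typed(current_node->output(i)) &&
+//               AddOutputTensorandScalarimplict(group_graph, current_node->output(i), i, current_node, engine)) {
+//               rewrote = true;  // current_node now points at the new subgraph; rescan it
+//               break;
+//           }
+//       }
+//   }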
+
+void AdjustmentScalarOutput(std::shared_ptr<torch::jit::Graph>& group_graph, Block* block, IEngine* engine) {
+    bool changed = false;
+    changed = adjust_scalar_output(group_graph, block, engine);
+    if (changed) {
EliminateDeadCode(group_graph); + EliminateCommonSubexpression(group_graph); + ConstantPooling(group_graph); + } +} + +void peephole_optimize_shape_expressions(Block* block) { + graph_node_list nodes = block->nodes(); + for (auto it = nodes.begin(); it != nodes.end(); ++it) { + Node* node = *it; + for (Block* subblock : node->blocks()) { + peephole_optimize_shape_expressions(subblock); + } + if (node->kind() == prim::BroadcastSizes) { + // Remove no-op broadcasts. + if (node->inputs().size() == 1) { + node->output()->replaceAllUsesWith(node->input()); + it.destroyCurrent(); + continue; + } + // Deduplicate inputs, but use their unique() values to ensure + // this process only depends on the graph. + std::map unique_to_value; + for (Value* input : node->inputs()) { + unique_to_value.emplace(input->unique(), input); + } + if (unique_to_value.size() != node->inputs().size()) { + std::vector inputs; + inputs.reserve(unique_to_value.size()); + for (auto& entry : unique_to_value) { + inputs.push_back(entry.second); + } + if (inputs.size() == 1) { + node->output()->replaceAllUsesWith(inputs[0]); + } else { + WithInsertPoint insert_guard{node}; + node->output()->replaceAllUsesWith(broadcast_sizes(inputs)); + } + it.destroyCurrent(); + --it; // Revisit the node with deduplicated inputs + continue; + } + // Remove compose simple chains of broadcasts into a single node. + const auto& uses = node->output()->uses(); + if (uses.size() == 1 && uses[0].user->kind() == prim::BroadcastSizes) { + Node* user = uses[0].user; + user->removeInput(uses[0].offset); + // NB: we don't care about deduplication in here, as we will visit user + // later. + for (Value* i : node->inputs()) { + user->addInput(i); + } + it.destroyCurrent(); + } + } + } +} // peephole_optimize_shape_expressions + +void guard_fusion_group(Node* fusion) { + // Fixup types of the subgraph inputs + std::vector guard_types; + std::vector inputs_to_check; + for (Value* input : fusion->inputs()) { + // We only check inputs of the fusion group and expect NNC to infer + // intermediates and outputs shapes + if (!input->type()->cast()) { + continue; + } + + // note: modified from original implementation, we are guarding fusion + // outputs + if (input->node()->kind() == prim::Constant) { + continue; + } + inputs_to_check.push_back(input); + guard_types.push_back(input->type()); + } + if (!inputs_to_check.size()) { + return; + } + + Node* typecheck_node = fusion->owningGraph() + //this is not right, i should register my own type to torchscrilpt + //->create(prim::CudaFusionGuard, inputs_to_check, 1) + ->create(prim::FusionGroup, inputs_to_check, 1) + ->insertBefore(fusion); + // fix output to BoolType + typecheck_node->output()->setType(BoolType::get()); + Value* typecheck_result = typecheck_node->output(); + typecheck_node->tys_(attr::types, guard_types); + + std::unordered_map typechecked_inputs; + + // Insert if block + Node* versioning_if = + fusion->owningGraph() + ->create(prim::If, {typecheck_result}, fusion->outputs().size()) + ->insertAfter(typecheck_node); + for (size_t idx = 0; idx < fusion->outputs().size(); ++idx) { + versioning_if->output(idx)->setType(fusion->output(idx)->type()); + fusion->output(idx)->replaceAllUsesWith(versioning_if->output(idx)); + } + Block* true_block = versioning_if->addBlock(); + Block* false_block = versioning_if->addBlock(); + + // Fill in the false block. It should contain the unoptimized + // copy of the fused subgraph. 
+ auto& subgraph = *fusion->g(attr::Subgraph); + WithInsertPoint guard(false_block->return_node()); + const std::vector subgraph_outputs = + insertGraph(*fusion->owningGraph(), subgraph, fusion->inputs()); + for (Value* output : subgraph_outputs) { + false_block->registerOutput(output); + } + + // types get copied to the fallback graph, so remove specializations before + // replacing + // TODO: this is not exposed here, I need to remove that before inserting the + // graph + // removeTensorTypeSpecializations(false_block); + replaceBlockWithFallbackGraph(false_block, fusion->inputs()); + + // Fill in the true block. It has all inputs type-checked and its + // body should be the fusion group node. + fusion->moveBefore(true_block->return_node()); + for (Value* output : fusion->outputs()) { + true_block->registerOutput(output); + } +} // guard_fusion_group + +void guard_fusion_groups(Block* block) { + std::vector fusions; + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + guard_fusion_groups(b); + } + if (n->kind() == prim::CudaFusionGroup) { + fusions.push_back(n); + } + } + for (Node* fusion : fusions) { + guard_fusion_group(fusion); + } +} // guard_fusion_groups + +} // anonymous namespace + +void graph_segment(std::shared_ptr& graph, IEngine* engine) { + + GRAPH_DUMP("before PorosGraphSegment Graph: ", graph); + PorosGraphSegment(graph->block(), graph, engine).run(); + GRAPH_DUMP("after PorosGraphSegment Graph: ", graph); + //guard_fusion_groups(graph->block()); + + //necessary passes after segmentation + { + torch::jit::EliminateCommonSubexpression(graph); + torch::jit::EliminateDeadCode(graph); + peephole_optimize_shape_expressions(graph->block()); + torch::jit::RemoveTensorTypeSpecializations(graph); + GRAPH_DUMP("after necessary pass Graph: ", graph); + } + + //necessary adjustmentations after segmentation + { + AdjustmentListTensorInput(graph, graph->block()); + PorosGraphSegment(graph->block(), graph, engine).run("input"); + GRAPH_DUMP("after AdjustmentListTensorInput Graph: ", graph); + AdjustmentListTensorOutput(graph, graph->block()); + PorosGraphSegment(graph->block(), graph, engine).run("output"); + GRAPH_DUMP("after AdjustmentListTensorOutput Graph: ", graph); + AdjustmentScalarInput(graph, graph->block(), engine); + GRAPH_DUMP("after AdjustmentScalarInput Graph: ", graph); + AdjustmentScalarOutput(graph, graph->block(), engine); + GRAPH_DUMP("after AdjustmentScalarOutput Graph: ", graph); + } +} + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/compile/graph_segment.h b/poros/src/poros/compile/graph_segment.h new file mode 100644 index 0000000000..2498edada8 --- /dev/null +++ b/poros/src/poros/compile/graph_segment.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file graph_segment.h +* @author tianjinjin@baidu.com +* @date Thu Mar 18 14:33:54 CST 2021 +* @brief +**/ + +#pragma once + +//pytorch +#include + +#include "poros/engine/iengine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/** + * @brief graph_segment fullfil the segmentation func of given graph + * @param [in/out] graph : the graph to be segmented + * @param [in] engine : backend engine + * @return + **/ +void graph_segment(std::shared_ptr& graph, IEngine* engine); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/compile/ivalues_analysis.cpp b/poros/src/poros/compile/ivalues_analysis.cpp new file mode 100644 index 0000000000..55c261663f --- /dev/null +++ b/poros/src/poros/compile/ivalues_analysis.cpp @@ -0,0 +1,930 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file ivalues_analysis.cpp +* @author tianjinjin@baidu.com +* @date Fri Apr 23 11:41:59 CST 2021 +* @brief +**/ +#include "poros/compile/ivalues_analysis.h" + +#include + +#include +#include +#include +#include //check +#include //to get executioncontext +#include //for get numprofileruns + +#include "poros/util/poros_util.h" +#include "poros/context/poros_global.h" + +//I have to copy the ProfileOp function here +namespace torch { +namespace jit { + +const Symbol ProfileOp::Kind = ::c10::prim::profile; +void ProfileOp::cloneFrom(torch::jit::Node* other_) { + torch::jit::Node::cloneFrom(other_); + auto other = other_->cast(); + this->callback_ = other->getCallback(); +} + +torch::jit::Node* ProfileOp::allocNewInstance(torch::jit::Graph* g) { + return new ProfileOp(g, {nullptr}); +} + +} // namespace jit +} // namespace torch + +namespace baidu { +namespace mirana { +namespace poros { + +IvalueAnalysis::IvalueAnalysis(std::shared_ptr g) + : profiled_graph_(std::move(g)), profiling_count_(torch::jit::getNumProfiledRuns()) {} + +void IvalueAnalysis::insert_input_listsize_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* input_value = node->input(offset); + + // 创建profile node + torch::jit::ProfileOp *pn = create_profile_node(nullptr, {input_value}); + auto pno = pn->addOutput(); + pn->ty_(torch::jit::attr::profiled_type, input_value->type()); + pno->setType(input_value->type()); + + std::function list_size_profile = [this, pno, node](torch::jit::Stack& stack){ + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + std::lock_guard lock(this->mutex_); + if (ivalue.isList()) { + auto input_value = pno->node()->input(0); + //not exist yet, insert it + if (_list_size_map._list_size_map_input.count(input_value) == 0) { + std::set int_set{(int32_t)ivalue.toListRef().size()}; + _list_size_map._list_size_map_input[input_value][node] = int_set; + if (ivalue.isTensorList()) { + auto tl = ivalue.toTensorList(); + std::map> type_map; + for(size_t i = 0; i < ivalue.toListRef().size(); i++){ + auto tlty = 
torch::jit::tensorTypeInCurrentExecutionContext(tl[i]); + type_map[i] = {tlty}; + } + _list_size_map._list_tensor_type_map_input[input_value][node] = type_map; + } + } + else { + if (_list_size_map._list_size_map_input[input_value].count(node) == 0) { + std::set int_set{(int32_t)ivalue.toListRef().size()}; + _list_size_map._list_size_map_input[input_value][node] = int_set; + } + else { + _list_size_map._list_size_map_input[input_value][node].insert(ivalue.toListRef().size()); + } + if (ivalue.isTensorList()) { + auto tl = ivalue.toTensorList(); + std::map> &type_map = _list_size_map._list_tensor_type_map_input[input_value][node]; + for(size_t i = 0; i < ivalue.toListRef().size(); i++) { + auto tlty = torch::jit::tensorTypeInCurrentExecutionContext(tl[i]); + type_map[i].push_back(tlty); + } + } + } + // extract int[] values to map + if (input_value->type()->isSubtypeOf(c10::ListType::ofInts()) && + input_value->node()->kind() != torch::jit::prim::Constant && ivalue.isIntList()) { + auto& value_vec_map = _int_intlist_values_per_frame[frame_id]; + // extract int[] + std::vector int_vec; + c10::List c10_int_list = ivalue.toIntList(); + for (int64_t i : c10_int_list) { + int_vec.push_back(i); + } + //not exist yet, insert it + if (value_vec_map.count(input_value) == 0) { + std::vector> int_vec_vec; + int_vec_vec.push_back(int_vec); + value_vec_map.insert({input_value, int_vec_vec}); + } else { + value_vec_map[input_value].push_back(int_vec); + } + } + } + torch::jit::push(stack, ivalue); + }; + pn->setCallback(list_size_profile); + pn->insertBefore(node); + node->replaceInput(offset, pn->output()); +} + +void IvalueAnalysis::insert_number_eval_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* input_value = node->input(offset); + // 创建profile node + torch::jit::ProfileOp *pn = create_profile_node(nullptr, {input_value}); + auto pno = pn->addOutput(); + pn->ty_(torch::jit::attr::profiled_type, input_value->type()); + pno->setType(input_value->type()); + + std::function int_intlist_profile = [this, input_value](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + std::lock_guard lock(this->mutex_); + if (ivalue.isInt()) { + auto& value_vec_map = _int_intlist_values_per_frame[frame_id]; + // extract int + std::vector int_vec; + int_vec.push_back(ivalue.toInt()); + //not exist yet, insert it + if (value_vec_map.count(input_value) == 0) { + std::vector> int_vec_vec; + int_vec_vec.push_back(int_vec); + value_vec_map.insert({input_value, int_vec_vec}); + } else { + value_vec_map[input_value].push_back(int_vec); + } + } + // passing t through + torch::jit::push(stack, ivalue); + }; + pn->setCallback(int_intlist_profile); + pn->insertBefore(node); + node->replaceInput(offset, pn->output()); +} + +void IvalueAnalysis::insert_output_listsize_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* output_value = node->output(offset); + //watch this value + auto eval_pn = create_profile_node(nullptr, {output_value}); + auto pno = eval_pn->addOutput(); + eval_pn->ty_(torch::jit::attr::profiled_type, output_value->type()); + pno->setType(output_value->type()); + + //do we need outout? and change the input of prim::If? 
+ std::function eval_profiler = [this, pno, node](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + std::lock_guard lock(this->mutex_); + + if (ivalue.isList()) { + //not exist yet, insert it + auto input_value = pno->node()->input(0); + if (_list_size_map._list_size_map_output.count(input_value) == 0) { + std::set int_set{(int32_t)ivalue.toListRef().size()}; + _list_size_map._list_size_map_output[input_value][node] = int_set; + + if (ivalue.isTensorList()) { + auto tl = ivalue.toTensorList(); + std::map> type_map; + for(size_t i = 0; i < ivalue.toListRef().size(); i++){ + auto tlty = torch::jit::tensorTypeInCurrentExecutionContext(tl[i]); + type_map[i] = {tlty}; + } + _list_size_map._list_tensor_type_map_output[input_value][node] = type_map; + } + } + else { + if (_list_size_map._list_size_map_output[input_value].count(node) == 0) { + std::set int_set{(int32_t)ivalue.toListRef().size()}; + _list_size_map._list_size_map_output[input_value][node] = int_set; + } + else { + _list_size_map._list_size_map_output[input_value][node].insert(ivalue.toListRef().size()); + } + + if (ivalue.isTensorList()) { + auto tl = ivalue.toTensorList(); + std::map> &type_map = _list_size_map._list_tensor_type_map_output[input_value][node]; + for(size_t i = 0; i < ivalue.toListRef().size(); i++) { + auto tlty = torch::jit::tensorTypeInCurrentExecutionContext(tl[i]); + type_map[i].push_back(tlty); + } + } + } + } + torch::jit::push(stack, ivalue); + }; + eval_pn->setCallback(eval_profiler); + eval_pn->insertAfter(node); +} + +//TODO: check out the difference between the ProfileIValueOp and ProfileOp +torch::jit::ProfileOp* IvalueAnalysis::create_profile_node( + const std::function& fp, + at::ArrayRef inputs) { + auto pn = new torch::jit::ProfileOp(profiled_graph_.get(), fp); + for (auto in : inputs) { + pn->addInput(in); + } + return pn; +} + +/* +torch::jit::ProfileOptionalOp* IvalueAnalysis::create_profile_optional_node( + const std::function& fp, + at::ArrayRef inputs) { + auto pn = new torch::jit::ProfileOptionalOp(profiled_graph_.get(), fp); + pn->i_(torch::jit::attr::num_present, 0); + pn->i_(torch::jit::attr::num_none, 0); + for (auto in : inputs) { + pn->addInput(in); + } + return pn; +} */ + +void IvalueAnalysis::insert_shape_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* input_value = node->input(offset); + //watch this value + auto pn = create_profile_node(nullptr, {input_value}); + auto pno = pn->addOutput(); + pn->ty_(torch::jit::attr::profiled_type, c10::TensorType::get()); + pno->setType(c10::TensorType::get()); + + std::function shape_profiler = [this, pno](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + if (ivalue.isTensor()) { + std::lock_guard lock(this->mutex_); + std::map>& profiled_types = _profiled_types_per_frame[frame_id]; + at::Tensor t = ivalue.toTensor(); + if (t.defined()) { + at::TensorTypePtr pttp = torch::jit::tensorTypeInCurrentExecutionContext(t); + if (profiled_types.count(pno) == 0) { + //insert value and tensortype info + profiled_types.insert({pno, {pttp}}); + } else { + std::vector& type_list = profiled_types.at(pno); + type_list.push_back(pttp); + // auto type = profiled_types.at(pno); + // pttp = type->merge(*pttp); + // profiled_types[pno] = pttp; + } + } else { + profiled_types[pno] = {c10::TensorType::get()->withUndefined()}; + } + } + // passing t through + 
torch::jit::push(stack, ivalue); + }; + pn->setCallback(shape_profiler); + pn->insertBefore(node); + node->replaceInput(offset, pn->output()); +} + +/* +void IvalueAnalysis::insert_optional_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* input_value = node->input(offset); + // this value + auto opt_pn = create_profile_optional_node(nullptr, {input_value}); + // watch the definition instead of the use, + // because we are only optimizing in the case of a None value which is immutable + std::function optional_profiler = [this, opt_pn](torch::jit::Stack& stack) { + std::lock_guard lock(this->mutex_); + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + if (ivalue.isNone()) { + opt_pn->i_(torch::jit::attr::num_none, opt_pn->i(torch::jit::attr::num_none) + 1); + } else { + opt_pn->i_(torch::jit::attr::num_present, opt_pn->i(torch::jit::attr::num_present) + 1); + } + torch::jit::push(stack, ivalue); + }; + opt_pn->setCallback(optional_profiler); + auto pno = opt_pn->addOutput(); + pno->setType(input_value->type()); + opt_pn->insertAfter(input_value->node()); + input_value->replaceAllUsesAfterNodeWith(opt_pn, pno); +} */ + +//TODO: check more +void IvalueAnalysis::insert_eval_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* output_value = node->output(offset); + //watch this value + auto eval_pn = create_profile_node(nullptr, {output_value}); + auto pno = eval_pn->addOutput(); + eval_pn->ty_(torch::jit::attr::profiled_type, output_value->type()); + pno->setType(output_value->type()); + + //do we need outout? and change the input of prim::If? + std::function eval_profiler = [this, pno](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + std::lock_guard lock(this->mutex_); + if (ivalue.isBool()) { + //not exist yet, insert it + if (_evaluate_values_map.count(pno) == 0) { + std::vector bool_vector{ivalue.toBool()}; + _evaluate_values_map[pno] = bool_vector; + } else { + _evaluate_values_map[pno].emplace_back(ivalue.toBool()); + } + } + torch::jit::push(stack, ivalue); + }; + eval_pn->setCallback(eval_profiler); + eval_pn->insertAfter(node); + //replace the user nodes. 
+    for (auto use: output_value->uses()) {
+        auto consumer_node = use.user;
+        for(size_t offset = 0; offset < consumer_node->inputs().size(); offset++) {
+            if (consumer_node->input(offset) == output_value &&
+                consumer_node != eval_pn) {
+                consumer_node->replaceInput(offset, eval_pn->output());
+            }
+        }
+    }
+}
+
+std::map<torch::jit::Value*, c10::TensorTypePtr> IvalueAnalysis::merge_tensor_type_per_frame(
+            std::map<torch::jit::Value*, std::vector<c10::TensorTypePtr>>& profiled_map) {
+    std::map<torch::jit::Value*, c10::TensorTypePtr> merged_tensor_type;
+    for (auto iter : profiled_map) {
+        torch::jit::Value* profile_value = iter.first;
+        std::vector<c10::TensorTypePtr> type_list = iter.second;
+        for (auto tensor_type : type_list) {
+            if (merged_tensor_type.count(profile_value) == 0) {
+                merged_tensor_type.insert({profile_value, tensor_type});
+            } else {
+                c10::TensorTypePtr type = merged_tensor_type.at(profile_value);
+                tensor_type = type->merge(*tensor_type);
+                merged_tensor_type[profile_value] = tensor_type;
+            }
+        }
+    }
+    return merged_tensor_type;
+}
+
+c10::SymbolicShape IvalueAnalysis::merge_symbolic_shapes(
+            const c10::SymbolicShape& new_sizes,
+            const c10::SymbolicShape& sym_shapes,
+            torch::jit::SetPartitioningHelper& partition_helper) {
+    std::vector<c10::ShapeSymbol> new_symbols;
+    TORCH_INTERNAL_ASSERT(
+        new_sizes.rank().has_value() && sym_shapes.rank().has_value() &&
+        *new_sizes.rank() == *sym_shapes.rank());
+
+    for (size_t i = 0; i < *new_sizes.rank(); i++) {
+        if (!(*sym_shapes.sizes())[i].is_static() ||
+            !(*new_sizes.sizes())[i].is_static()) {
+            new_symbols.emplace_back();
+            continue;
+        }
+        auto symbol = (*sym_shapes.sizes())[i];
+        int64_t new_size = (*new_sizes.sizes())[i].static_size();
+        //GRAPH_DUMP("Merging symbol ", symbol);
+        auto new_sym = partition_helper.partitionSetByDimension(new_size, symbol);
+        new_symbols.emplace_back(new_sym);
+    }
+    return c10::SymbolicShape(new_symbols);
+}
+
+void IvalueAnalysis::analysis_ivalue_for_block(torch::jit::Block* block) {
+    for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) {
+        auto node = *it;
+        //iterate the input value of the node
+        for (size_t offset = 0; offset < node->inputs().size(); offset++) {
+            auto input_value = node->input(offset);
+            //tensortype handle
+            if (input_value->type()->kind() == c10::TypeKind::TensorType) {
+                insert_shape_profile(node, offset);
+            }
+            if (input_value->type()->kind() == c10::TypeKind::ListType){
+                insert_input_listsize_profile(node, offset);
+            }
+            // Pitfall note (0318): make sure a value analyzed later has not already had a profile
+            // node attached earlier. Otherwise the earlier profile output replaces the value, the
+            // map lookups no longer find the value they expect, and the profile callback of the
+            // same value runs multiple times, causing unpredictable problems.
+            if (input_value->type()->kind() == c10::TypeKind::IntType &&
+                input_value->node()->kind() != torch::jit::prim::Constant) {
+                insert_number_eval_profile(node, offset);
+            }
+            //TODO: WHY NOT SUPPORT ProfileOptionalOp anymore after I upgrade libtorch from 1.7.1 to 1.8.1
+            // if (input_value->type()->cast<c10::OptionalType>() &&
+            //     has_gradsum_to_size_uses(input_value)) {
+            //     insert_optional_profile(node, offset);
+            // }
+
+            if (input_value->type()->kind() == c10::TypeKind::BoolType) {
+                insert_eval_profile(input_value->node(), 0);
+                //TODO: modify the second input 0 to more strict check
+            }
+        }
+        for (size_t offset = 0; offset < node->outputs().size(); offset++) {
+            auto output_value = node->output(offset);
+            if (output_value->type()->kind() == c10::TypeKind::ListType){
+                insert_output_listsize_profile(node, offset);
+                it++;
+            }
+        }
+
+        for (auto b : node->blocks()) {
+            analysis_ivalue_for_block(b);
+        }
+    }
+
+    //insert shape profile for block outputs
+    for (size_t offset = 0; offset < block->return_node()->inputs().size(); offset++) {
+        auto input_value = block->return_node()->input(offset);
+        if (input_value->type()->isSubtypeOf(c10::TensorType::get())) {
+            insert_shape_profile(block->return_node(), offset);
+        }
+        // //TODO: should I add this??
+        // if (input_value->type()->kind() == c10::TypeKind::BoolType) {
+        //     insert_eval_profile(input_value->node(), 0);
+        // }
+    }
+}
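+
+// After instrumentation every watched value is routed through a prim::profile node whose
+// callback records the observed IValue. Schematically (hypothetical IR fragment):
+//   before:  %y = aten::matmul(%x, %w)
+//   after:   %x.p : Tensor = prim::profile(%x)
+//            %w.p : Tensor = prim::profile(%w)
+//            %y = aten::matmul(%x.p, %w.p)
+// Every callback pops (frame_id, ivalue) from the interpreter stack and must push the ivalue
+// back, since a profile node forwards its input unchanged.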
+
+void IvalueAnalysis::gen_list_size() {
+    PorosGlobalContext::instance()._list_size_map = _list_size_map;
+}
+
+void IvalueAnalysis::gen_value_dyanamic_shape() {
+
+    std::map<torch::jit::Value*, ValueDynamicShape>& value_dynamic_shape_map = PorosGlobalContext::instance()._value_dynamic_shape_map;
+    if (_profiled_types_per_frame.size() < 3) {
+        throw c10::Error("dynamic_shape must have three prewarm data [max & min & opt]", "");
+    }
+
+    auto profiled_types_iter = _profiled_types_per_frame.begin();  //frame id
+    auto start_frame_id = profiled_types_iter->first;  // keys into std::map<torch::jit::Value*, std::vector<c10::TensorTypePtr>>
+
+    //max
+    for (auto &e : _profiled_types_per_frame[start_frame_id++]) {
+        auto profile_value = e.first->node()->input();
+        // create the entry if it does not exist yet
+        if (value_dynamic_shape_map.count(profile_value) == 0) {
+            ValueDynamicShape shape;
+            value_dynamic_shape_map[profile_value] = shape;
+            value_dynamic_shape_map[profile_value].is_dynamic = false;
+        }
+
+        std::vector<c10::TensorTypePtr>& shape_list = e.second;
+        std::vector<int64_t> current_shape;
+        for (auto &shape : shape_list) {
+            if (shape->sizes().concrete_sizes().has_value()) {
+                current_shape = shape->sizes().concrete_sizes().value();
+            } else {
+                // this runs before the pruning pass, so some blocks may never have been executed;
+                // the profile nodes in those blocks hold empty values and are skipped here
+                continue;
+            }
+            // when a value feeds several nodes, it gets several profiles. Likewise, when a tensor
+            // appears inside a loop, the related ops most likely run several times, which also
+            // yields several profiles.
+            // Pitfall note (2021.11.11): when the profiled sizes disagree, max must take the
+            // largest of them and min the smallest.
+            if (value_dynamic_shape_map[profile_value].max_shapes.size() != 0) {
+                auto old_shape = value_dynamic_shape_map[profile_value].max_shapes;
+                std::vector<int64_t> new_shape;
+                for (size_t i = 0; i < old_shape.size(); ++i) {
+                    new_shape.push_back(std::max(old_shape[i], current_shape[i]));
+                }
+                value_dynamic_shape_map[profile_value].max_shapes = new_shape;
+                // LOG(INFO) << "try to update max shape, current_shape: [" << current_shape
+                //         << "], old_shape: [" << old_shape
+                //         << "], new_shape: [" << new_shape << "]";
+            } else {
+                value_dynamic_shape_map[profile_value].max_shapes = current_shape;
+            }
+        }
+    }
+
+    //min
+    for (auto &e : _profiled_types_per_frame[start_frame_id++]) {
+        //TODO: maybe need to check the value existing before setting
+        auto profile_value = e.first->node()->input();
+        std::vector<c10::TensorTypePtr> shape_list = e.second;
+        std::vector<int64_t> current_shape;
+        for (auto &shape : shape_list) {
+            if (shape->sizes().concrete_sizes().has_value()) {
+                current_shape = shape->sizes().concrete_sizes().value();
+            } else {
+                // skip profile nodes from blocks that were never executed (see above)
+                continue;
+            }
+            if (value_dynamic_shape_map[profile_value].min_shapes.size() != 0) {
+                auto old_shape = value_dynamic_shape_map[profile_value].min_shapes;
+                std::vector<int64_t> new_shape;
+                for (size_t i = 0; i < old_shape.size(); ++i) {
+                    new_shape.push_back(std::min(old_shape[i], current_shape[i]));
+                }
+                value_dynamic_shape_map[profile_value].min_shapes = new_shape;
+                // LOG(INFO) << "try to update min shape, current_shape: [" << current_shape
+                //         << "], old_shape: [" << old_shape
+                //         << "], new_shape: [" << new_shape << "]";
+            } else {
+                value_dynamic_shape_map[profile_value].min_shapes = current_shape;
+            }
+        }
+    }
+
+    //opt
+    for (auto &e : _profiled_types_per_frame[start_frame_id++]) {
+        auto profile_value = e.first->node()->input();
+        std::vector<c10::TensorTypePtr> shape_list = e.second;
+        for (auto &shape : shape_list) {
+            if (shape->sizes().concrete_sizes().has_value()) {
+                value_dynamic_shape_map[profile_value].opt_shapes = shape->sizes().concrete_sizes().value();
+            }
+        }
+    }
+
+    for (auto &e : value_dynamic_shape_map) {
+        ValueDynamicShape& shape = e.second;
+        // Pitfall note (2022.09.28): when one of the recorded shapes of a value has size 0, the
+        // value lives in a block whose execution depends on the query (a loop whose trip count,
+        // or an if branch whose condition, is query-dependent). Splitting subgraphs under such a
+        // block fails, because the input size information cannot be produced during subgraph
+        // replacement. Handle that case here.
+        if (shape.max_shapes.size() == 0 || shape.min_shapes.size() == 0 || shape.opt_shapes.size() == 0) {
+            if (e.first->node()->kind() == torch::jit::prim::Constant) {
+                continue;
+            }
+            LOG(INFO) << "value shape info for: %" << e.first->debugName()
+                    << ", max_shape: " << shape.max_shapes
+                    << ", min_shape: " << shape.min_shapes
+                    << ", opt_shape: " << shape.opt_shapes;
+            PorosGlobalContext::instance()._disable_subblock_convert = true;
+            continue;
+        }
+        for (size_t i = 0; i < shape.max_shapes.size(); ++i) {
+            if (shape.max_shapes[i] == shape.min_shapes[i] && shape.max_shapes[i] == shape.opt_shapes[i]) {
+                shape.sizes.push_back(shape.max_shapes[i]);
+            } else {
+                shape.sizes.push_back(-1);
+                shape.is_dynamic = true;
+            }
+        }
+        // LOG(INFO) << "value shape info for: %" << e.first->debugName()
+        //         << ", max_shape: " << shape.max_shapes
+        //         << ", min_shape: " << shape.min_shapes
+        //         << ", opt_shape: " << shape.opt_shapes;
+    }
+}
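+
+// Worked example (illustration only): suppose the three warm-up rounds profile a value with
+// shapes [8, 16], [1, 16] and [4, 16] (the max, min and opt rounds respectively). The merge
+// above then yields
+//   max_shapes = [8, 16], min_shapes = [1, 16], opt_shapes = [4, 16]
+//   sizes      = [-1, 16]    (dim 0 varies -> dynamic, dim 1 agrees -> static)
+// with is_dynamic = true, which is what later feeds the engine's dynamic-shape ranges.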
+
+void IvalueAnalysis::gen_int_intlist_value() {
+    std::map<torch::jit::Value*, ValueDynamicShape>& int_intlist_values_map = PorosGlobalContext::instance()._int_intlist_values_map;
+    if (_int_intlist_values_per_frame.size() == 0) {
+        return;
+    }
+
+    int64_t start_frame_id = _int_intlist_values_per_frame.begin()->first;  //frame id
+
+    //max
+    // e iterates over the pairs of a std::map<torch::jit::Value*, std::vector<std::vector<int64_t>>>
+    for (auto &e : _int_intlist_values_per_frame[start_frame_id++]) {
+        std::vector<std::vector<int64_t>> int_values_vecs = e.second;
+        if (int_values_vecs.size() == 0) {
+            continue;
+        }
+        size_t per_vec_size = int_values_vecs[0].size();
+        std::vector<int64_t> max_vector(per_vec_size, INT64_MIN);
+        bool length_is_var = false;
+        for (std::vector<int64_t>& v : int_values_vecs) {
+            if (max_vector.size() != v.size()) {
+                length_is_var = true;
+                break;
+            }
+            for (size_t i = 0; i < max_vector.size(); i++) {
+                max_vector[i] = std::max(max_vector[i], v[i]);
+            }
+        }
+        if (length_is_var) {
+            continue;
+        }
+
+        torch::jit::Value* int_value = e.first;
+        if (int_intlist_values_map.count(int_value) == 0) {
+            ValueDynamicShape shape;
+            shape.max_shapes = max_vector;
+            int_intlist_values_map[int_value] = shape;
+            int_intlist_values_map[int_value].is_dynamic = false;
+        } else {
+            int_intlist_values_map[int_value].max_shapes = max_vector;
+        }
+    }
+
+    if (_int_intlist_values_per_frame.size() == 1) {
+        for (auto &e : int_intlist_values_map) {
+            e.second.min_shapes = e.second.max_shapes;
+            e.second.opt_shapes = e.second.max_shapes;
+        }
+        return;
+    } else {
+        //min
+        for (auto &e : _int_intlist_values_per_frame[start_frame_id++]) {
+            std::vector<std::vector<int64_t>> int_values_vecs = e.second;
+            if (int_values_vecs.size() == 0) {
+                continue;
+            }
+            size_t per_vec_size = int_values_vecs[0].size();
+            std::vector<int64_t> min_vector(per_vec_size, INT64_MAX);
+            bool length_is_var = false;
+            for (std::vector<int64_t>& v : int_values_vecs) {
+                if (min_vector.size() != v.size()) {
+                    length_is_var = true;
+                    break;
+                }
+                for (size_t i = 0; i < min_vector.size(); i++) {
+                    min_vector[i] = std::min(min_vector[i], v[i]);
+                }
+            }
+            if (length_is_var) {
+                continue;
+            }
+
+            torch::jit::Value* int_value = e.first;
+            if (int_intlist_values_map.count(int_value) == 0) {
+                ValueDynamicShape
shape; + shape.min_shapes = min_vector; + int_intlist_values_map[int_value] = shape; + int_intlist_values_map[int_value].is_dynamic = false; + } else { + int_intlist_values_map[int_value].min_shapes = min_vector; + } + } + + //opt + for (auto &e : _int_intlist_values_per_frame[start_frame_id++]) { + std::vector> int_values_vecs = e.second; + if (int_values_vecs.size() == 0 ) { + continue; + } + size_t per_vec_size = int_values_vecs[0].size(); + bool length_is_var = false; + for (std::vector& v : int_values_vecs) { + if (per_vec_size != v.size()) { + length_is_var = true; + break; + } + } + if (length_is_var) { + continue; + } + + torch::jit::Value* int_value = e.first; + if (int_intlist_values_map.count(int_value) == 0) { + ValueDynamicShape shape; + shape.opt_shapes = int_values_vecs[0]; + int_intlist_values_map[int_value] = shape; + int_intlist_values_map[int_value].is_dynamic = false; + } else { + int_intlist_values_map[int_value].opt_shapes = int_values_vecs[0]; + } + } + } +} + +std::unique_ptr IvalueAnalysis::analysis_ivalue_for_graph( + const std::shared_ptr& graph) { + + auto new_g = graph->copy(); //copy or use the original one?? + auto ia = std::unique_ptr(new IvalueAnalysis(new_g)); + auto raw_ia = ia.get(); + + //clear the existing profile node that may exist. + torch::jit::ClearProfilingInformation(new_g); + //analysis main function + ia->analysis_ivalue_for_block(new_g->block()); + + std::function counter = [raw_ia](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + std::lock_guard lock(raw_ia->mutex_); + + if (raw_ia->profiling_count_ > 0) { + raw_ia->profiling_count_--; + } + + // merge tensortype profiling information from all runs + if (raw_ia->profiling_count_ == 0) { + LOG(INFO) << "Collected tensor profile " << raw_ia->_profiled_types_per_frame.size() << " records for run " << frame_id; + if (raw_ia->_profiled_types_per_frame.empty()) { + return; + } + // the key is a frame id, the value is a mapping from a Value in a graph to a profiled TensorType + // we make a copy of profiling information from the very first run + // and use it for building the symbol sets + auto profiled_types_iter = raw_ia->_profiled_types_per_frame.begin(); //frame id + + // merge itself + auto merged_profiled_types = raw_ia->merge_tensor_type_per_frame(profiled_types_iter->second); + ++profiled_types_iter; + + // merge profiling information from next runs into the first one + for (; profiled_types_iter != raw_ia->_profiled_types_per_frame.end(); ++profiled_types_iter) { + torch::jit::SetPartitioningHelper partition_helper; + for (const auto& val_type_pair : raw_ia->merge_tensor_type_per_frame(profiled_types_iter->second)) { + auto insertion_result = merged_profiled_types.insert(val_type_pair); + if (!insertion_result.second) { // Already existed + const c10::TensorType* type = insertion_result.first->second.get(); + //TODO: merge function take care more + //TODO: the merge function has change from torch1.7.1 to torch1.8.1 + auto merged_type = type->merge(*val_type_pair.second); + if (merged_type->sizes().size().has_value()) { + auto new_shape = raw_ia->merge_symbolic_shapes( + val_type_pair.second->symbolic_sizes(), type->symbolic_sizes(), partition_helper); + GRAPH_DEBUG("Merging ", *val_type_pair.second, " of run ", profiled_types_iter->first, " into ", *type); + merged_type = type->withSymbolicShapes(std::move(new_shape)); + GRAPH_DEBUG("Result : ", *merged_type); + insertion_result.first->second = std::move(merged_type); + } else { + // reset symbolic 
shapes when ranks are different + // TODO: attention here + insertion_result.first->second = std::move(merged_type); + } + } + } + } + + // update types in the graph + for (auto val_type_pair : merged_profiled_types) { + val_type_pair.first->node()->ty_(torch::jit::attr::profiled_type, val_type_pair.second); + } + } + + //TODO: check this more + // update eval information from all runs + if (raw_ia->profiling_count_ == 0) { + LOG(INFO) << "Collected evaluate " << raw_ia->_evaluate_values_map.size() << " records for run " << frame_id; + if (raw_ia->_evaluate_values_map.empty()) { + return; + } + + torch::jit::WithInsertPoint guard(raw_ia->profiled_graph_->block()->nodes().front()); + auto true_const = raw_ia->profiled_graph_->insertConstant(true); + auto false_const = raw_ia->profiled_graph_->insertConstant(false); + for (auto& value_bools_pair : raw_ia->_evaluate_values_map) { + auto profile_value = value_bools_pair.first; + auto bool_vector = value_bools_pair.second; + if (std::all_of(bool_vector.begin(), bool_vector.end(), + [](bool i){ return i == true;})) { + profile_value->node()->replaceInput(0, true_const); + //LOG(INFO) << "Replace " << node_info(profile_value->node()) << "input 0 as true_constant"; + } + + if (std::all_of(bool_vector.begin(), bool_vector.end(), + [](bool i){ return i == false;})) { + profile_value->node()->replaceInput(0, false_const); + //LOG(INFO) << "Replace " << node_info(profile_value->node()) << "input 0 as false_constant"; + } + } + } + }; //func counter end + + auto pop = ia->create_profile_node(counter, {}); + new_g->appendNode(pop); //put this profile at end of the graph to upback all the tensors. + GRAPH_DUMP("Instrumented Graph: ", new_g); + return ia; +} + +//DEPRECATED +bool has_gradsum_to_size_uses(torch::jit::Value* v) { + return std::any_of(v->uses().begin(), v->uses().end(), [](const torch::jit::Use& use) { + return use.user->kind() == torch::jit::aten::_grad_sum_to_size; + }); +} + +//DEPRECATED +void IvalueAnalysis::insert_debug_profile(torch::jit::Node* node, size_t offset) { + torch::jit::Value* input_value = node->input(offset); + //watch this value + auto pn = create_profile_node(nullptr, {input_value}); + //auto pno = pn->addOutput(); + pn->ty_(torch::jit::attr::profiled_type, c10::TensorType::get()); + //pno->setType(c10::TensorType::get()); + + std::function debug_profiler = [this, node](torch::jit::Stack& stack) { + int64_t frame_id = 0; + torch::jit::pop(stack, frame_id); + c10::IValue ivalue; + torch::jit::pop(stack, ivalue); + if (ivalue.isTensor()) { + std::lock_guard lock(this->mutex_); + auto t = ivalue.toTensor(); + if (t.defined()) { + auto pttp = torch::jit::tensorTypeInCurrentExecutionContext(t); + + //here. print node info. 
print input info + // std::cout << "debug during interprete, [node_info]:" << node_info_with_attr(node) + // <<", [input value type]: " << pttp->str() + // <<", [input value shape]: " << pttp->sizes().size() + // << std::endl; + } + } + // passing t through + torch::jit::push(stack, ivalue); + }; + + pn->setCallback(debug_profiler); + pn->insertBefore(node); + //node->replaceInput(offset, pn->output()); +} + +//DEPRECATED +void IvalueAnalysis::debug_tensors_for_block(torch::jit::Block* block) { + for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { + auto node = *it; + //iterate the input value of the node + for (size_t offset = 0; offset < node->inputs().size(); offset++) { + auto input_value = node->input(offset); + //tensortype handle + if (input_value->type()->kind() == c10::TypeKind::TensorType) { + insert_debug_profile(node, offset); + } + } + + for (auto b : node->blocks()) { + debug_tensors_for_block(b); + } + } + //insert shape profile for block outputs + for (size_t offset = 0; offset < block->return_node()->inputs().size(); offset++) { + auto input_value = block->return_node()->input(offset); + if (input_value->type()->isSubtypeOf(c10::TensorType::get())) { + insert_debug_profile(block->return_node(), offset); + } + } +} + +//DEPRECATED +std::vector get_prim_if_user(torch::jit::Value* value) { + std::vector if_nodes; + for (auto use : value->uses()) { + if (is_dim_equal_if_node(use.user)) { + if_nodes.emplace_back(use.user); + } + } + //sort + std::sort(if_nodes.begin(), if_nodes.end(), [&](torch::jit::Node* a, torch::jit::Node* b) { + return a->isBefore(b); + }); + return if_nodes; +} + +//DEPRECATED +void IvalueAnalysis::prune_if_block(torch::jit::Block* block) { + if (_evaluate_values_map.empty()) { + return; + } + for (auto itr = block->nodes().begin(); itr != block->nodes().end(); itr++) { + auto node = *itr; + //itr++; // nonono, not here. the next node may be if node. and may be already destroyed below + if (node->kind() == torch::jit::prim::profile && + node->outputs().size() == 0 && node->inputs().size() == 1) { + auto evaluate_values = _evaluate_values_map.find(node->input(0)); + if (evaluate_values != _evaluate_values_map.end()) { + auto bool_vector = evaluate_values->second; + if (std::all_of(bool_vector.begin(), bool_vector.end(), + [](bool i){ return i == true;})) { + //the result keep true during every data round + auto if_nodes = get_prim_if_user(node->input(0)); + for (auto if_node: if_nodes) { + inline_if_body(if_node->blocks().at(0)); + } + } + if (std::all_of(bool_vector.begin(), bool_vector.end(), + [](bool i){ return i == false;})) { + //the result keep false during every round + auto if_nodes = get_prim_if_user(node->input(1)); + for (auto if_node: if_nodes) { + inline_if_body(if_node->blocks().at(1)); + } + } + } + //cause it has no output. so destroy it directly. + node->destroy(); + _evaluate_values_map.erase(node->input(0)); + } else { + for (torch::jit::Block* ib : node->blocks()) { + prune_if_block(ib); + } + } + } +} + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/compile/ivalues_analysis.h b/poros/src/poros/compile/ivalues_analysis.h new file mode 100644 index 0000000000..69be38ecc5 --- /dev/null +++ b/poros/src/poros/compile/ivalues_analysis.h @@ -0,0 +1,119 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file ivalues_analysis.h
+* @author tianjinjin@baidu.com
+* @date Thu Mar 18 14:33:54 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include //for Stack
+#include //for ProfileOp
+#include //for SetPartitioningHelper
+
+#include "poros/context/poros_global.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+struct IvalueAnalysis {
+    //disable copy and move ops to avoid unexpected copies/moves happening inside the callback func
+    IvalueAnalysis(const IvalueAnalysis&) = delete;
+    IvalueAnalysis(IvalueAnalysis&&) noexcept = delete;
+    static std::unique_ptr<IvalueAnalysis> analysis_ivalue_for_graph(
+        const std::shared_ptr<torch::jit::Graph>& graph);
+
+    std::shared_ptr<torch::jit::Graph> profiled_graph_;
+    std::mutex mutex_;
+    size_t profiling_count_;
+    // the key is a frame id
+    // the value is a mapping from a Value in a graph to its profiled TensorTypes
+    std::map<int64_t, std::map<torch::jit::Value*, std::vector<c10::TensorTypePtr>>> _profiled_types_per_frame;
+    // the meaning of the key (int64_t) and of Value* is the same as in _profiled_types_per_frame.
+    // the vector records all int (and int[]) values observed for the Value* in a graph.
+    std::map<int64_t, std::map<torch::jit::Value*, std::vector<std::vector<int64_t>>>> _int_intlist_values_per_frame;
+    // std::map _evaluate_values_per_frame;
+    // we only store bool data. this may change in the future.
+    std::map<torch::jit::Value*, std::vector<bool>> _evaluate_values_map;
+
+    // stores information about list-typed values
+    ListSizeMap _list_size_map;
+
+    std::shared_ptr<torch::jit::Graph> graph() const {
+        return profiled_graph_;
+    }
+
+    // copy the dynamic-shape information into the context
+    void gen_value_dyanamic_shape();
+
+    // copy the list-size information into the context
+    void gen_list_size();
+
+    // copy the int / int[] value information into the context
+    void gen_int_intlist_value();
+
+private:
+    //ProfileIValueOp is not supported in pytorch 1.7.x,
+    //so I have to roll back to ProfileOp and copy its main function
+    torch::jit::ProfileOp* create_profile_node(
+        const std::function<void(torch::jit::Stack&)>& fp,
+        at::ArrayRef<torch::jit::Value*> inputs);
+
+    void analysis_ivalue_for_block(torch::jit::Block* block);
+    void insert_shape_profile(torch::jit::Node* node, size_t offset);
+    void insert_eval_profile(torch::jit::Node* node, size_t offset);
+    void insert_input_listsize_profile(torch::jit::Node* node, size_t offset);
+    void insert_output_listsize_profile(torch::jit::Node* node, size_t offset);
+    void insert_number_eval_profile(torch::jit::Node* node, size_t offset);
+
+    /**
+     * merge the tensortype list recorded for one given value into a single merged tensortype
+     * **/
+    std::map<torch::jit::Value*, c10::TensorTypePtr> merge_tensor_type_per_frame(
+        std::map<torch::jit::Value*, std::vector<c10::TensorTypePtr>>& profiled_map);
+
+    c10::SymbolicShape merge_symbolic_shapes(
+        const c10::SymbolicShape& new_sizes,
+        const c10::SymbolicShape& sym_shapes,
+        torch::jit::SetPartitioningHelper& partition_helper);
+
+    //DEPRECATED
+    //cut away if blocks whose condition did not change over all the warm-up data
+    void prune_if_block(torch::jit::Block* block);
+    //DEPRECATED
+    void debug_tensors_for_block(torch::jit::Block* block);
+    //DEPRECATED
+    void insert_debug_profile(torch::jit::Node* node, size_t offset);
+
+
+    //keep the constructor private.
+    IvalueAnalysis(std::shared_ptr<torch::jit::Graph> g);
+};
+
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/partition.cpp b/poros/src/poros/compile/partition.cpp
new file mode 100644
index 0000000000..401da922e3
--- /dev/null
+++ b/poros/src/poros/compile/partition.cpp
@@ -0,0 +1,192 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file partition.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Jun 3 15:10:30 CST 2021
+* @brief
+**/
+
+#include "poros/compile/partition.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// utility function to check whether the node implies a broadcast on a given shape
+// (assumed to be the shape of an input tensor)
+// limitations:
+// 1. we rely on shape information to judge this, so we require the output
+//    shape to be available;
+// 2. we basically compare the given shape to the shape of the only output of
+//    the node and return true if it implies a broadcast from the former to the
+//    latter.
+bool maybeBroadcastOnShape(
+        const torch::jit::Node* node,
+        const std::vector<c10::optional<int64_t>>& shape) {
+    //TODO: add outputs size check
+    //TORCH_INTERNAL_ASSERT(n->outputs().size() == 1, "not expecting multiple outputs from a node, graph partitioning logic needs to be updated");
+    // assumes that if the output is not a tensor type, it's not broadcasting
+    if (auto out_type = node->output(0)->type()->cast<c10::TensorType>()) {
+        if (out_type->dim()) {
+            if (out_type->dim().value() < shape.size()) {
+                // reduced rank means a reduction, not a broadcast;
+                return false;
+            } else if (out_type->dim().value() > shape.size()) {
+                // increased rank means the input is broadcast up to the output rank;
+                return true;
+            } else {
+                // same rank: iterate through the sizes and check whether a size-1
+                // dimension exists in the input `shape`
+                for (const auto& opt_size : shape) {
+                    // TODO: not sure if we need to check for output size != 1, since we
+                    // are currently marking all size-1 dimensions as broadcast in codegen.
+                    if (opt_size.has_value() && opt_size.value() == 1) {
+                        return true;
+                    }
+                }
+            }
+        }
+    }
+    return false;
+}
+
+// bool hasReductionOperation(const torch::jit::Node* node) {
+//     if (torch::jit::fuser::cuda::isReductionNode(node)) {
+//         return true;
+//     }
+//     if (node->kind() == torch::jit::prim::CudaFusionGroup) {
+//         for (auto n : node->g(torch::jit::attr::Subgraph)->nodes()) {
+//             if (hasReductionOperation(n)) {
+//                 return true;
+//             }
+//         }
+//     }
+//     return false;
+// }
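+
+// Worked example (comment only): suppose an input was profiled with shape
+// [4, 1, 8]. An output of rank 4 means the input must be broadcast up in rank
+// (returns true); an output of rank 2 implies a reduction (returns false); an
+// output of the same rank 3 returns true because the size-1 middle dimension
+// may be expanded by the op.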
+
+bool createTrickyBroadcast(const torch::jit::Node* consumer, const torch::jit::Node* producer) {
+
+    auto count_broadcasting_in_node =
+        [](const torch::jit::Node* node,
+           const std::vector<c10::optional<int64_t>>& shape,
+           size_t offset) {
+        int num_broadcasting = 0;
+        if (node->kind() == torch::jit::prim::CudaFusionGroup) {
+            // be careful here: `subgraph_input`, as its name suggests, is in a
+            // different graph from `node`.
+            const auto& subgraph_input = node->g(torch::jit::attr::Subgraph)->inputs()[offset];
+            for (const auto& use : subgraph_input->uses()) {
+                if (maybeBroadcastOnShape(use.user, shape)) {
+                    num_broadcasting++;
+                }
+            }
+        } else {
+            if (maybeBroadcastOnShape(node, shape)) {
+                num_broadcasting++;
+            }
+        }
+        return num_broadcasting;
+    };
+
+    // case 1. We check shared inputs to `producer` & `consumer`;
+    for (int i = 0; i < static_cast<int>(producer->inputs().size()); i++) {
+        auto n_input = producer->input(i);
+        auto n_input_type = n_input->type()->cast<c10::TensorType>();
+        if (n_input_type != nullptr && n_input_type->sizes().sizes()) {
+            std::vector<c10::optional<int64_t>> n_input_shape = n_input_type->sizes().sizes().value();
+            int num_broadcasting = 0;
+
+            // check broadcasting for the n_input inside `consumer`;
+            for (const auto& use : n_input->uses()) {
+                if (use.user == consumer) {
+                    num_broadcasting += count_broadcasting_in_node(consumer, n_input_shape, use.offset);
+                }
+            }
+
+            // if no broadcasting happened for the consumer, there's no point checking
+            // for multiple broadcasts in the producer alone;
+            if (num_broadcasting == 0) {
+                continue;
+            }
+
+            // check broadcasting for n_input inside `producer`;
+            num_broadcasting += count_broadcasting_in_node(producer, n_input_shape, i);
+
+            // encountered multiple broadcasting schemes for a single TV; we will not
+            // be able to schedule this, so prevent the fusion; (case 1)
+            if (num_broadcasting > 1) {
+                return true;
+            }
+        }
+    }
+
+    // case 2. We check inputs to `consumer` that are also outputs of `producer`
+    for (int i = 0; i < static_cast<int>(producer->outputs().size()); i++) {
+        auto n_output = producer->output(i);
+        auto n_output_type = n_output->type()->cast<c10::TensorType>();
+        if (n_output_type != nullptr && n_output_type->sizes().sizes()) {
+            std::vector<c10::optional<int64_t>> n_output_shape = n_output_type->sizes().sizes().value();
+            int num_broadcasting = 0;
+            // If we only look at case 1 & case 2, we need to check broadcast of
+            // `n_output` inside `producer`, if it is a `prim::CudaFusionGroup`.
+            // this is actually not necessary when we consider case 3, as we avoid
+            // broadcasting on outputs already;
+
+            // TODO: merge this code with case 1.
+            // check broadcasting for the n_output inside `consumer`;
+            bool use_as_output = false;
+            for (const auto& use : n_output->uses()) {
+                if (use.user == consumer) {
+                    num_broadcasting += count_broadcasting_in_node(consumer, n_output_shape, use.offset);
+                } else {
+                    // case 3. the output is used by nodes other than the consumer; no
+                    // broadcasting is allowed;
+                    use_as_output = true;
+                }
+            }
+
+            // encountered multiple broadcasting schemes for a single TV; we will not
+            // be able to schedule this, so prevent the fusion; (case 2)
+            // Alternatively, if use_as_output is true, we do not permit broadcast
+            // at all. (case 3)
+            if (num_broadcasting > (use_as_output ? 0 : 1)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+bool is_node_fusable(const torch::jit::Node* node, IEngine* engine) {
+    if (node->kind() == torch::jit::prim::CudaFusionGroup || (engine->is_node_supported(node))) {
+        return true;
+    }
+    return false;
+}
+
+bool is_node_fusable(const torch::jit::Node* fusion,
+                     const torch::jit::Node* node,
+                     IEngine* engine) {
+    if (is_node_fusable(node, engine) && !createTrickyBroadcast(fusion, node)) {
+        return true;
+    }
+    return false;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/partition.h b/poros/src/poros/compile/partition.h
new file mode 100644
index 0000000000..3f193197c8
--- /dev/null
+++ b/poros/src/poros/compile/partition.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file partition.h
+* @author tianjinjin@baidu.com
+* @date Thu Jun 3 14:57:58 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include "torch/script.h"
+
+#include "poros/engine/iengine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool is_node_fusable(const torch::jit::Node* node, IEngine* engine);
+bool is_node_fusable(const torch::jit::Node* fusion,
+                     const torch::jit::Node* node,
+                     IEngine* engine);
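+
+// Illustrative sketch (comment only; `graph`, `engine` and `fusion_group` are
+// assumptions about the calling code): a greedy segmentation pass walks the
+// block in topological order and probes each node with these predicates:
+//
+//   for (auto node : graph->block()->nodes()) {
+//       if (fusion_group == nullptr) {
+//           if (is_node_fusable(node, engine)) { /* start a new fusion group */ }
+//       } else if (is_node_fusable(fusion_group, node, engine)) {
+//           /* supported by the engine and no tricky broadcast: safe to merge */
+//       }
+//   }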
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/poros_module.cpp b/poros/src/poros/compile/poros_module.cpp
new file mode 100644
index 0000000000..5b4e618a53
--- /dev/null
+++ b/poros/src/poros/compile/poros_module.cpp
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_module.cpp
+* @author huangben@baidu.com
+* @date 2021/08/05 11:39:03 CST 2021
+* @brief
+**/
+
+#include "poros/compile/poros_module.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+std::unique_ptr<PorosModule> Load(const std::string& filename, const PorosOptions& options) {
+    torch::jit::Module module;
+    try {
+        module = torch::jit::load(filename);
+    } catch (const c10::Error& e) {
+        LOG(ERROR) << "error loading the model";
+        return nullptr;
+    }
+    std::unique_ptr<PorosModule> poros_module(new PorosModule(module));
+    poros_module->_options = options;
+
+    if (options.device == GPU) {
+        poros_module->to(at::kCUDA);
+    }
+
+    if (options.debug == true) {
+        // when this is set, all INFO-level logs are printed
+        c10::ShowLogInfoToStderr();
+    }
+
+    return poros_module;
+}
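+
+// Illustrative usage (comment only; "model.pt" and `inputs` are placeholders):
+//   baidu::mirana::poros::PorosOptions options;
+//   options.device = baidu::mirana::poros::GPU;
+//   options.use_fp16 = true;
+//   auto poros_mod = baidu::mirana::poros::Load("model.pt", options);
+//   if (poros_mod != nullptr) {
+//       auto result = poros_mod->forward(inputs);  // inputs: std::vector<c10::IValue>
+//   }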
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/poros_module.h b/poros/src/poros/compile/poros_module.h
new file mode 100644
index 0000000000..9a836b437a
--- /dev/null
+++ b/poros/src/poros/compile/poros_module.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_module.h
+* @author huangben@baidu.com
+* @date 2021/08/05 11:39:03 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+#include
+#include
+//#include
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+enum Device : int8_t {
+    GPU = 0,
+    CPU,
+    XPU,
+    UNKNOW
+};
+
+struct PorosOptions {
+    Device device = GPU;
+    bool debug = false;
+    bool use_fp16 = false;
+    bool is_dynamic = false;
+    // this flag works for the tensorrt engine and defaults to true.
+    // when long_to_int=true, values of type at::kLong are handled as at::kInt
+    // (because tensorrt does not support at::kLong).
+    // this setting may change numerical precision; if the results do not meet
+    // expectations, set the flag to false.
+    bool long_to_int = true;
+    //DynamicShapeOptions dynamic_shape_options;
+    uint64_t max_workspace_size = 1ULL << 30;
+    // for XPU the default is -1, which means the first available device
+    int32_t device_id = -1;
+    // threshold for the number of non-const ops
+    int32_t unconst_ops_thres = -1;
+    // Nvidia TF32 computes inner products by rounding the inputs to 10-bit mantissas
+    // before multiplying, but accumulates the sums using 23-bit mantissas to accelerate
+    // the calculation.
+    // note: it only works on the Ampere architecture (such as A10) and may cause small
+    // numerical differences in the results.
+    bool use_nvidia_tf32 = true;
+    // preprocess mode
+    // 0: use torch.jit.script
+    // 1: use torch.jit.trace
+    int32_t preprocess_mode = 0;
+    // user-defined list of unsupported ops
+    std::vector<std::string> unsupport_op_list;
+};
+
+class PorosModule : public torch::jit::Module {
+public:
+    PorosModule(torch::jit::Module module) : torch::jit::Module(module) {
+    }
+    ~PorosModule() = default;
+
+    void to_device(Device device) {
+        _options.device = device;
+    }
+
+    //c10::IValue forward(std::vector<c10::IValue> inputs);
+    //void save(const std::string& filename);
+public:
+    PorosOptions _options;
+
+};
+
+//via porosmodule.save
+std::unique_ptr<PorosModule> Load(const std::string& filename, const PorosOptions& options);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/segment.cpp b/poros/src/poros/compile/segment.cpp
new file mode 100644
index 0000000000..0bf88e5807
--- /dev/null
+++ b/poros/src/poros/compile/segment.cpp
@@ -0,0 +1,231 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file segment.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Mar 19 19:18:20 CST 2021
+* @brief
+**/
+
+#include "poros/compile/segment.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+std::vector<const torch::jit::Value*> sort_topological(const at::ArrayRef<const torch::jit::Value*> inputs,
+        const torch::jit::Block* cur_block,
+        bool reverse) {
+    //values that are not in the same block are bypassed
+    std::vector<const torch::jit::Value*> result;
+    for (auto i : inputs) {
+        if (i->node()->owningBlock() == cur_block) {
+            result.push_back(i);
+        }
+    }
+
+    if (reverse) {
+        // sort in reverse topological order
+        std::sort(result.begin(), result.end(), [&](const torch::jit::Value* a, const torch::jit::Value* b) {
+            return a->node()->isAfter(b->node());
+        });
+    } else {
+        std::sort(result.begin(), result.end(), [&](const torch::jit::Value* a, const torch::jit::Value* b) {
+            return a->node()->isBefore(b->node());
+        });
+    }
+    return result;
+}
+
+std::vector<torch::jit::Value*> sort_topological(const at::ArrayRef<torch::jit::Value*> inputs,
+        const torch::jit::Block* cur_block,
+        bool reverse) {
+    //values that are not in the same block are bypassed
+    std::vector<torch::jit::Value*> result;
+    for (auto i : inputs) {
+        if (i->node()->owningBlock() == cur_block) {
+            result.push_back(i);
+        }
+    }
+
+    if (reverse) {
+        // sort in reverse topological order
+        std::sort(result.begin(), result.end(), [&](torch::jit::Value* a, torch::jit::Value* b) {
+            return a->node()->isAfter(b->node());
+        });
+    } else {
+        std::sort(result.begin(), result.end(), [&](torch::jit::Value* a, torch::jit::Value* b) {
+            return a->node()->isBefore(b->node());
+        });
+    }
+    return result;
+}
+
+void stable_dfs(const torch::jit::Block& block, bool reverse,
+        const std::vector<const torch::jit::Node*>& start,
+        const std::function<bool(const torch::jit::Node*)>& enter,
+        const std::function<bool(const torch::jit::Node*)>& leave) {
+    std::vector<NodeDFSResult> stack(start.size());
+    for (size_t i = 0; i < start.size(); ++i) {
+        stack[i] = NodeDFSResult{start[i], false};
+    }
+
+    std::unordered_map<const torch::jit::Node*, bool> visited;
+    while (!stack.empty()) {
+        NodeDFSResult w = stack.back();
+        stack.pop_back();
+
+        auto n = w.node;
+        if (w.leave) {
+            if (leave && !leave(n)) {
+                return;
+            }
+            continue;
+        }
+
+        if (visited.find(n) != visited.end()) {
+            continue;
+        }
+        visited[n] = true;
+
+        if (enter && !enter(n)) {
+            return;
+        }
+
+        if (leave) {
+            stack.push_back(NodeDFSResult{n, true});
+        }
+
+        auto values = reverse ? n->inputs() : n->outputs();
+        auto sorted_value_list = sort_topological(values, n->owningBlock(), false);
+        for (auto value : sorted_value_list) {
+            if (visited.find(value->node()) == visited.end()) {
+                //push the node that produced the value, so the walk actually advances
+                stack.push_back(NodeDFSResult{value->node(), false});
+            }
+        }
+    }
+}
+
+bool can_contract(const torch::jit::Node* from_node,
+        const torch::jit::Node* to_node,
+        const torch::jit::Block& block) {
+    std::vector<const torch::jit::Node*> dfs_start_nodes;
+
+    for (auto i : to_node->inputs()) {
+        if (i->node() != from_node) {
+            dfs_start_nodes.push_back(i->node());
+        }
+    }
+
+    bool has_cycle = false;
+    stable_dfs(block, /*reverse=*/true, dfs_start_nodes, /*enter=*/nullptr,
+        [&has_cycle, from_node](const torch::jit::Node* n) {
+            if (n == from_node) {
+                has_cycle = true;
+                return false;
+            }
+            return true;
+        });
+    return !has_cycle;
+}
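+
+// Worked example (comment only): with edges a->b, a->c and c->b, contracting
+// (a, b) must be rejected. The reverse DFS starts from b's other input c and
+// walks producers; it reaches a (the from_node), which proves a second path
+// a->c->b exists and the contraction would create a cycle.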
+
+torch::jit::Graph& get_subgraph(torch::jit::Node* n) {
+    AT_ASSERT(n->kind() == torch::jit::prim::CudaFusionGroup);
+    return *n->g(torch::jit::attr::Subgraph);
+}
+
+torch::jit::Node* merge_node_into_subgraph(torch::jit::Node* group, torch::jit::Node* n) {
+    auto& subgraph = get_subgraph(group);
+    std::unordered_map<torch::jit::Value*, torch::jit::Value*> inputs_map;
+    size_t i = 0;
+    size_t tensor_insert_idx = 0;
+    //cache the original group input data
+    AT_ASSERT(group->inputs().size() == subgraph.inputs().size());
+    for (auto input : group->inputs()) {
+        inputs_map[input] = subgraph.inputs()[i++];
+        if (input->type()->isSubtypeOf(c10::TensorType::get())) {
+            tensor_insert_idx = i;
+        }
+    }
+
+    torch::jit::WithInsertPoint guard(*subgraph.nodes().begin());
+    for (auto input : n->inputs()) {
+        //means we should add this new input
+        if (inputs_map.count(input) == 0) {
+            //consider tensor types first (that is the pytorch tradition)
+            if (input->type()->isSubtypeOf(c10::TensorType::get())) {
+                auto in_group = subgraph.insertInput(tensor_insert_idx);
+                in_group->setType(input->type());
+                inputs_map[input] = in_group;
+                group->insertInput(tensor_insert_idx, input);
+                tensor_insert_idx++;
+            } else if ((input->type()->isSubtypeOf(c10::FloatType::get()) &&
+                    input->node()->kind() != torch::jit::prim::Constant) ||
+                    (n->kind() == torch::jit::aten::_grad_sum_to_size &&
+                    input->type()->isSubtypeOf(c10::ListType::ofInts()))) {
+                auto in_group = subgraph.addInput();
+                in_group->setType(input->type());
+                inputs_map[input] = in_group;
+                group->addInput(input);
+            } else if (input->node()->kind() == torch::jit::prim::Constant) {
+                torch::jit::Node* in_const = subgraph.createClone(input->node(), [](torch::jit::Value*) -> torch::jit::Value* {
+                    throw std::runtime_error("unexpected input");
+                });
+                subgraph.insertNode(in_const);
+                inputs_map[input] = in_const->output();
+            } else {
+                // TODO: we need to figure out which input scalars are supported
+                LOG(WARNING) << "meet some unexpected node: " << input->node()->kind().toQualString();
+                auto in_group = subgraph.addInput();
+                in_group->setType(input->type());
+                inputs_map[input] = in_group;
+                group->addInput(input);
+            }
+        }
+    } // for (auto input : n->inputs())
+
+    // copy n into the graph, remapping its inputs to internal nodes
+    torch::jit::Node* in_graph = subgraph.createClone(
+        n, [&](torch::jit::Value* k) -> torch::jit::Value* { return inputs_map[k]; });
+
+    auto inputs = group->inputs();
+    for (size_t i = 0; i < n->outputs().size(); ++i) {
+        auto it = std::find(inputs.begin(), inputs.end(), n->outputs()[i]);
+        if (it != inputs.end()) {
+            size_t p = it - inputs.begin();
+            group->removeInput(p);
+            subgraph.inputs()[p]->replaceAllUsesWith(in_graph->outputs()[i]);
+            subgraph.eraseInput(p);
+        }
+    }
+    return subgraph.insertNode(in_graph);
+}
+
+torch::jit::Node* change_node_to_subgraph(torch::jit::Node* group, torch::jit::Node* n) {
+    group->insertBefore(n);
+    torch::jit::Node* mergedNode = merge_node_into_subgraph(group, n);
+    get_subgraph(group).registerOutput(mergedNode->output());
+    auto sel = group->addOutput();
+    sel->copyMetadata(n->output());
+    n->replaceAllUsesWith(group);
+    n->destroy();
+    return group;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/compile/segment.h b/poros/src/poros/compile/segment.h
new file mode 100644
index 0000000000..5eb1a8e344
--- /dev/null
+++ b/poros/src/poros/compile/segment.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file segment.h
+* @author tianjinjin@baidu.com
+* @date Thu Mar 18 14:33:54 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "poros/engine/iengine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+struct NodeDFSResult {
+    const torch::jit::Node* node;
+    bool leave; // Are we entering or leaving n?
+};
+
+//template <typename T> //should I?
+class NodeLink {
+public:
+    NodeLink(torch::jit::Node* n)
+        : _size(1), _head(nullptr), _value(n) {}
+
+    ~NodeLink() {
+        if (_head != nullptr) {
+            _head = nullptr;
+        }
+        if (_value != nullptr) {
+            _value = nullptr;
+        }
+    }
+
+    int size() { return find_root()->_size; }
+    int merge(NodeLink* other) {
+        NodeLink* a = find_root();
+        NodeLink* b = other->find_root();
+        if (a == b) {
+            return 0;
+        }
+        b->_head = a;
+        a->_size += b->_size;
+        return 0;
+    }
+
+    // Retrieves the value for the root of the set.
+    torch::jit::Node* head_value() { return find_root()->_value; }
+
+    // Returns the value for the object.
+    torch::jit::Node* value() const { return _value; }
+    //int64_t value_index() { return _value->topo_position_; }
+
+private:
+    NodeLink* find_root() {
+        if (!_head) {
+            return this;
+        }
+        _head = _head->find_root();
+        return _head;
+    }
+
+    int _size;
+    NodeLink* _head;
+    torch::jit::Node* _value;
+};
+
+
+struct SegmentOptions {
+    int minimum_segment_size = 2; //the minimum number of nodes each segment must contain.
+};
+
+struct Segment {
+    Segment() {}
+    Segment(std::set<torch::jit::Node*>& nodes)
+        : nodes(nodes) {}
+    std::set<torch::jit::Node*> nodes;
+};
+
+using SegmentVector = std::vector<Segment>;
+using ValueVector = std::vector<torch::jit::Value*>;
+
+ValueVector sort_topological(const at::ArrayRef<torch::jit::Value*> inputs,
+        const torch::jit::Block* cur_block,
+        bool reverse = false);
+std::vector<const torch::jit::Value*> sort_topological(const at::ArrayRef<const torch::jit::Value*> inputs,
+        const torch::jit::Block* cur_block,
+        bool reverse = false);
+
+void stable_dfs(const torch::jit::Block& block, bool reverse,
+        const std::vector<const torch::jit::Node*>& start,
+        const std::function<bool(const torch::jit::Node*)>& enter,
+        const std::function<bool(const torch::jit::Node*)>& leave);
+
+
+bool can_contract(const torch::jit::Node* from_node,
+        const torch::jit::Node* to_node,
+        const torch::jit::Block& block);
+
+torch::jit::Graph& get_subgraph(torch::jit::Node* n);
+torch::jit::Node* merge_node_into_subgraph(torch::jit::Node* group, torch::jit::Node* n);
+torch::jit::Node* change_node_to_subgraph(torch::jit::Node* group, torch::jit::Node* n);
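+
+// Illustrative sketch (comment only): NodeLink is a union-find handle used to
+// grow segments; merge() unions two sets and head_value() returns the
+// representative node of the merged set:
+//   NodeLink a(node_a);
+//   NodeLink b(node_b);
+//   a.merge(&b);  // a and b now belong to the same segment
+//   // a.size() == 2, and a.head_value() == b.head_value() == node_a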
+
+//void segment_graph_new(std::shared_ptr& graph, IEngine* engine);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/context/poros_global.cpp b/poros/src/poros/context/poros_global.cpp
new file mode 100644
index 0000000000..461d28158b
--- /dev/null
+++ b/poros/src/poros/context/poros_global.cpp
@@ -0,0 +1,122 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_global.cpp
+* @author tianshaoqing@baidu.com
+* @author huangben@baidu.com
+* @date Fri Jul 23 11:21:10 CST 2021
+* @brief
+**/
+
+#include "poros/context/poros_global.h"
+#include "poros/converter/iconverter.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+void ListSizeMap::update_value(torch::jit::Value* old_value, torch::jit::Value* new_value) {
+    if (_list_size_map_input.count(old_value) != 0) {
+        _list_size_map_input[new_value] = _list_size_map_input[old_value];
+        _list_tensor_type_map_input[new_value] = _list_tensor_type_map_input[old_value];
+    }
+    if (_list_size_map_output.count(old_value) != 0) {
+        _list_size_map_output[new_value] = _list_size_map_output[old_value];
+        _list_tensor_type_map_output[new_value] = _list_tensor_type_map_output[old_value];
+    }
+}
+
+void ListSizeMap::update_node(torch::jit::Node* old_node, torch::jit::Node* new_node) {
+    for (size_t i = 0; i < new_node->inputs().size(); i++) {
+        auto value = new_node->input(i);
+        if (_list_size_map_input.count(value) != 0) {
+            if (_list_size_map_input[value].count(old_node) != 0) {
+                _list_size_map_input[value][new_node] = _list_size_map_input[value][old_node];
+                _list_size_map_input[value].erase(old_node);
+
+                _list_tensor_type_map_input[value][new_node] = _list_tensor_type_map_input[value][old_node];
+                _list_tensor_type_map_input[value].erase(old_node);
+            }
+        }
+    }
+
+    for (size_t i = 0; i < new_node->outputs().size(); i++) {
+        auto value = new_node->output(i);
+        if (_list_size_map_output.count(value) != 0) {
+            if (_list_size_map_output[value].count(old_node) != 0) {
+                _list_size_map_output[value][new_node] = _list_size_map_output[value][old_node];
+                _list_size_map_output[value].erase(old_node);
+
+                _list_tensor_type_map_output[value][new_node] = _list_tensor_type_map_output[value][old_node];
+                _list_tensor_type_map_output[value].erase(old_node);
+            }
+        }
+    }
+}
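+
+// Example of the intended semantics (comment only): if the same list value
+// %list feeds two different nodes and an aten::append between them changes its
+// length, the recorded sizes differ per consumer; keying by Value* first and
+// Node* second keeps both observations:
+//   _list_size_map_input[%list][node_a]  // sizes observed at node_a
+//   _list_size_map_input[%list][node_b]  // sizes observed at node_b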
+
+// store the PorosOptions in the global context, then initialize the
+// user-defined list of unsupported ops
+void PorosGlobalContext::set_poros_options(const PorosOptions& options) {
+    _poros_options = options;
+    for (auto i : _converters_map) {
+        i.second->init_unsupport_op_set();
+    }
+}
+
+// register a converter into the global PorosGlobalContext
+void PorosGlobalContext::register_converter(const std::string& engine_name, IConverter* converter) {
+    //find the ConvertersMap that belongs to engine_name
+    //(note: the ConvertersMaps of different engines are independent of each other)
+    auto search = _converters_map.find(engine_name);
+    if (search == _converters_map.end()) {
+        _converters_map[engine_name] = new ConvertersMap();
+    }
+    auto e_converter_map = _converters_map[engine_name];
+
+    //build a ConvRegistration from the converter's node_kind() and schema_string() information.
+    auto node_kind_list = converter->node_kind();
+    auto schema_list = converter->schema_string();
+    for (auto& node_kind : node_kind_list) {
+        //some converters come without schemas, such as aten::Constant
+        ConvRegistration conv_reg;
+        conv_reg.kind = node_kind;
+        conv_reg.converter = converter;
+        if (schema_list.size() == 0) {
+            conv_reg.options = ConverterOptions();
+        } else {
+            conv_reg.options = ConverterOptions().set_valid_schemas(schema_list);
+        }
+        //call add_converter on the ConvertersMap to complete the registration.
+        e_converter_map->add_converter(node_kind, conv_reg);
+    }
+    return;
+}
+
+ConvertersMap* PorosGlobalContext::get_converter_map(const std::string& engine_name) {
+    auto search = _converters_map.find(engine_name);
+    if (search == _converters_map.end()) {
+        return nullptr;
+    }
+    return search->second;
+}
+
+void PorosGlobalContext::destroy() {
+    for (auto &e : _converters_map) {
+        delete e.second;
+    }
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/context/poros_global.h b/poros/src/poros/context/poros_global.h
new file mode 100644
index 0000000000..7d367fcb48
--- /dev/null
+++ b/poros/src/poros/context/poros_global.h
@@ -0,0 +1,162 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_global.h
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Fri Jul 23 11:21:10 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+#include "poros/compile/poros_module.h"
+#include "poros/iplugin/plugin_create.h"
+#include "poros/util/macros.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// a two-level map is used here to avoid the aliasing problem caused by aten::append
+// when the same value is fed as a list input to different nodes
+typedef std::map<torch::jit::Value*, std::map<torch::jit::Node*, std::vector<int64_t>>> LIST_SIZE_MAP;
+typedef std::map<torch::jit::Value*, std::map<torch::jit::Node*, std::vector<std::vector<c10::TensorTypePtr>>>> TENSOR_LIST_TYPE_MAP;
+
+struct ListSizeMap {
+    // stores the size info of list-typed inputs and outputs
+    LIST_SIZE_MAP _list_size_map_input;
+    LIST_SIZE_MAP _list_size_map_output;
+
+    // stores the type info of tensor-list-typed inputs and outputs
+    TENSOR_LIST_TYPE_MAP _list_tensor_type_map_input;
+    TENSOR_LIST_TYPE_MAP _list_tensor_type_map_output;
+
+    /**
+     * @brief propagate the recorded information from old_value to new_value
+     * @param [in] old_value : the original value
+     * @param [in] new_value : the new value
+     * @return null
+     **/
+    void update_value(torch::jit::Value* old_value, torch::jit::Value* new_value);
+
+    /**
+     * @brief propagate the recorded information from old_node to new_node
+     * @param [in] old_node : the original node
+     * @param [in] new_node : the new node
+     * @return null
+     **/
+    void update_node(torch::jit::Node* old_node, torch::jit::Node* new_node);
+};
+
+struct ValueDynamicShape {
+    std::vector<int64_t> sizes;
+    std::vector<int64_t> max_shapes;
+    std::vector<int64_t> min_shapes;
+    std::vector<int64_t> opt_shapes;
+    bool is_dynamic = false;
+};
+
+// forward declarations
+class ConvertersMap;
+class IConverter;
+
+class PorosGlobalContext {
+public:
+    static PorosGlobalContext& instance() {
+        static PorosGlobalContext _instance;
+        return _instance;
+    }
+
+    ~PorosGlobalContext() {
+        destroy();
+    }
+
+    int init() {
+        //to change
+        return 0;
+    }
+
+    void set_poros_options(const PorosOptions& options);
+
+    PorosOptions& get_poros_options() {
+        return _poros_options;
+    }
+
+    void destroy();
+
+    // register a converter into the global PorosGlobalContext.
+    void register_converter(const std::string& engine_name, IConverter* converter);
+
+    ConvertersMap* get_converter_map(const std::string& engine_name);
+public:
+    plugin_creator_map_t _engine_creator_map;
+    std::map<torch::jit::Value*, ValueDynamicShape> _value_dynamic_shape_map;
+    ListSizeMap _list_size_map;
+    std::map<torch::jit::Value*, ValueDynamicShape> _int_intlist_values_map;
+    bool _disable_subblock_convert = false;
+
+    const std::set<c10::Symbol> supported_mutable_ops_set = {
+        //aten::append.t(t[](a!) self, t(c -> *) el) -> t[](a!)
+        c10::Symbol::fromQualString("aten::append"),
+        //"aten::_set_item.t(t [](a!) l, int idx, t(b -> *) el) -> t[](a!)"
+        c10::Symbol::fromQualString("aten::_set_item"),
+    };
+private:
+    PorosOptions _poros_options;
+    std::unordered_map<std::string, ConvertersMap*> _converters_map;
+};
+
+/*-------------------------------------------------------------------------
+            macros for automatic converter registration
+-------------------------------------------------------------------------*/
+template <typename T>
+class ConverterRegister {
+public:
+    inline ConverterRegister(std::string name = "",
+        PorosGlobalContext& context = PorosGlobalContext::instance()) noexcept;
+};
+
+template <typename T>
+inline ConverterRegister<T>::ConverterRegister(std::string name,
+        PorosGlobalContext& context) noexcept {
+    auto instance = new T();
+    context.register_converter(name, instance);
+}
+
+#define POROS_REGISTER_CONVERTER(name, reg) \
+    static ConverterRegister<reg> POROS_CONVERTER_REGISTER_init_ ## reg (#name);
+
+//automatic engine registration
+template <typename T>
+class EngineRegister {
+public:
+    EngineRegister(const std::string& name) {
+        register_plugin_class<T>(name, PorosGlobalContext::instance()._engine_creator_map);
+    }
+};
+
+#define POROS_REGISTER_ENGINE(name) \
+    static EngineRegister<name> POROS_ENGINE_REGISTER_init_##name(#name);
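+
+// Illustrative example (comment only; MyReluConverter is a hypothetical
+// IConverter implementation):
+//   class MyReluConverter : public GpuConverter { /* ... */ };
+//   POROS_REGISTER_CONVERTER(TensorrtEngine, MyReluConverter);
+// this expands to a static ConverterRegister<MyReluConverter> whose constructor
+// runs at startup and calls register_converter("TensorrtEngine", instance).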
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/activation.cpp b/poros/src/poros/converter/gpu/activation.cpp
new file mode 100644
index 0000000000..b55ed4065f
--- /dev/null
+++ b/poros/src/poros/converter/gpu/activation.cpp
@@ -0,0 +1,234 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file activation.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/activation.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/util/macros.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/poros_util.h"
+#include "poros/converter/gpu/converter_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool ActivationConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1 || inputs.size() == 2
+            || inputs.size() == 3 || inputs.size() == 4),
+        "invalid inputs size for ActivationConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ActivationConverter is not Tensor as expected");
+
+    auto nv_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((nv_tensor != nullptr),
+        "Unable to init input tensor for node: " << *node);
+
+    nvinfer1::ActivationType activate_type;
+    if (node->kind() == torch::jit::aten::relu || node->kind() == torch::jit::aten::relu_) {
+        activate_type = nvinfer1::ActivationType::kRELU;
+    } else if (node->kind() == torch::jit::aten::relu6 || node->kind() == torch::jit::aten::relu6_) {
+        activate_type = nvinfer1::ActivationType::kRELU;
+    } else if (node->kind() == torch::jit::aten::sigmoid || node->kind() == torch::jit::aten::sigmoid_) {
+        activate_type = nvinfer1::ActivationType::kSIGMOID;
+    } else if (node->kind() == torch::jit::aten::tanh || node->kind() == torch::jit::aten::tanh_) {
+        activate_type = nvinfer1::ActivationType::kTANH;
+    } else if (node->kind() == torch::jit::aten::leaky_relu) {
+        activate_type = nvinfer1::ActivationType::kLEAKY_RELU;
+    } else if (node->kind() == torch::jit::aten::hardtanh || node->kind() == torch::jit::aten::hardtanh_) {
+        activate_type = nvinfer1::ActivationType::kCLIP;
+    } else if (node->kind() == torch::jit::aten::elu) {
+        activate_type = nvinfer1::ActivationType::kELU;
+    } else if (node->kind() == torch::jit::aten::silu) {
+        activate_type = nvinfer1::ActivationType::kSIGMOID;
+    } else {
+        POROS_THROW_ERROR("We should never reach here for ActivationConverter, meet Unsupported ActivationType!");
+    }
+
+    auto new_layer = engine->network()->addActivation(*nv_tensor, activate_type);
+
+    //set attributes for aten::leaky_relu
+    //"aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor",
+    if (activate_type == nvinfer1::ActivationType::kLEAKY_RELU) {
+        POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for aten::leaky_relu in ActivationConverter");
+        auto negative_slopeScalar = (engine->context().get_constant(inputs[1])).toScalar().to<float>();
+        new_layer->setAlpha(negative_slopeScalar);
+    }
+
+    //set attributes for aten::hardtanh
+    //"aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor",
+    if (activate_type == nvinfer1::ActivationType::kCLIP) {
+        POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for aten::hardtanh in ActivationConverter");
+        auto min = (engine->context().get_constant(inputs[1])).toDouble();
+        auto max = (engine->context().get_constant(inputs[2])).toDouble();
+        new_layer->setAlpha(min);
+        new_layer->setBeta(max);
+    }
+
+    //set attributes for aten::elu
+    //"aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor"
+    if (activate_type == nvinfer1::ActivationType::kELU) {
+        POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for aten::elu in ActivationConverter");
+        auto alpha = (engine->context().get_constant(inputs[1])).toDouble();
+        new_layer->setAlpha(alpha);
+    }
+
+    new_layer->setName((layer_info(node) + "_IActivationLayer").c_str());
+    nvinfer1::ITensor* output = new_layer->getOutput(0);
+    if (node->kind() == torch::jit::aten::relu6 || node->kind() == torch::jit::aten::relu6_) {
+        nvinfer1::ITensor* relu_output = new_layer->getOutput(0);
+        auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(at::kFloat);
+        at::Tensor relu6_max = at::tensor({6.0}, options_pyt);
+        nvinfer1::ITensor* relu6_max_nv = tensor_to_const(engine, relu6_max);
+
+        auto min_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kMIN,
+            relu_output,
+            relu6_max_nv,
+            layer_info(node) + "_min");
+        output = min_layer->getOutput(0);
+    } else if (node->kind() == torch::jit::aten::silu) {
+        nvinfer1::ITensor* sigmoid_output = new_layer->getOutput(0);
+        auto prod_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kPROD,
+            sigmoid_output,
+            nv_tensor,
+            layer_info(node) + "_prod");
+        output = prod_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output shape: " << output->getDimensions();
+    return true;
+}
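+
+// Worked example (comment only): relu6 is lowered as min(relu(x), 6) using the
+// kMIN elementwise layer above, so for x = 8: relu gives 8, then the clamp
+// yields 6. silu is lowered as x * sigmoid(x) via the kPROD elementwise layer.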
+
+// aten::gelu(Tensor self) -> Tensor
+// aten::gelu(Tensor self, *, str approximate='none') -> Tensor
+bool GeluActivationConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1 || inputs.size() == 2), "invalid inputs size for GeluActivationConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for GeluActivationConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr),
+        "Unable to init input tensor for node: " << *node);
+    nvinfer1::DataType type = in->getType();
+
+    POROS_CHECK((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF),
+        "gelu only supports kFLOAT and kHALF");
+
+    std::string pluginName = "CustomGeluPluginDynamic";
+    nvinfer1::PluginFieldCollection fc;
+    std::vector<nvinfer1::PluginField> f;
+
+    //TODO: maybe need to consider more about the op_precision situation
+    // int type_id = ctx->settings.op_precision == nvinfer1::DataType::kFLOAT
+    //     ? 0
+    //     : 1; // Integer encoding the DataType (0: FP32, 1: FP16)
+    int type_id = (type == nvinfer1::DataType::kFLOAT) ? 0 : 1;
+    f.emplace_back(nvinfer1::PluginField("type_id", &type_id, nvinfer1::PluginFieldType::kINT32, 1));
+
+    std::string mode = "gelu";
+    f.emplace_back(nvinfer1::PluginField("mode", &mode, nvinfer1::PluginFieldType::kCHAR, 1));
+
+    fc.nbFields = f.size();
+    fc.fields = f.data();
+
+    auto creator = getPluginRegistry()->getPluginCreator("CustomGeluPluginDynamic", "1", "");
+    auto gelu_plugin = creator->createPlugin("gelu", &fc);
+
+    POROS_CHECK(gelu_plugin, "Unable to create gelu plugin from TensorRT plugin registry" << *node);
+    auto new_layer =
+        engine->network()->addPluginV2(reinterpret_cast<nvinfer1::ITensor* const*>(&in), 1, *gelu_plugin);
+    new_layer->setName((layer_info(node) + "_plugin_gelu").c_str());
+    auto out_tensor = new_layer->getOutput(0);
+    engine->context().set_tensor(node->outputs()[0], out_tensor);
+    LOG(INFO) << "Output shape: " << out_tensor->getDimensions();
+    return true;
+}
+
+/*"aten::prelu(Tensor self, Tensor weight) -> Tensor"*/
+bool PreluActivationConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for PreluActivationConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for PreluActivationConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto maybe_slopes = engine->context().get_constant(inputs[1]);
+    POROS_CHECK_TRUE((maybe_slopes.isTensor()), "Unable to init input const-tensor for node: " << *node);
+    auto slopes = maybe_slopes.toTensor(); //at::Tensor
+    //auto slopes_size = sizes_to_nvdim(slopes.sizes());
+
+    //bool to_reshape = false;
+    auto original_shape = in->getDimensions();
+
+    // The channel dim is the 2nd dim of the input. When the input has fewer than 2 dims,
+    // there is no channel dim and the number of channels = 1.
+    at::Tensor weight;
+    if (slopes.numel() != 1) {
+        std::vector<at::Tensor> weights;
+        std::vector<int64_t> reshape_shape;
+        bool sign = true;
+        for (int i = 0; i < original_shape.nbDims; i++) {
+            if (original_shape.d[i] == slopes.numel() && sign) {
+                sign = false;
+                continue;
+            }
+            if (!sign) {
+                reshape_shape.push_back(original_shape.d[i]);
+            }
+        }
+
+        for (int64_t i = 0; i < slopes.numel(); i++) {
+            auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat32);
+            auto tmp = at::ones(reshape_shape, options_pyt);
+            weights.push_back((slopes[i] * tmp).unsqueeze(0));
+        }
+
+        weight = torch::cat(weights, 0);
+        weight = weight.unsqueeze(0);
+    } else {
+        weight = slopes;
+    }
+
+    auto slope_tensor = tensor_to_const(engine, weight);
+    auto new_layer = engine->network()->addParametricReLU(*in, *slope_tensor);
+    new_layer->setName((layer_info(node) + "_IParametricReLULayer").c_str());
+    auto out_tensor = new_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], out_tensor);
+    LOG(INFO) << "Output shape: " << out_tensor->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ActivationConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, GeluActivationConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, PreluActivationConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/activation.h b/poros/src/poros/converter/gpu/activation.h
new file mode 100644
index 0000000000..8ce274e108
--- /dev/null
+++ b/poros/src/poros/converter/gpu/activation.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file activation.h
+* @author tianjinjin@baidu.com
+* @date Wed Jul 28 15:24:51 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include
+
+//from pytorch
+#include
+#include
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ActivationConverter : public GpuConverter {
+public:
+    ActivationConverter() {}
+    virtual ~ActivationConverter() {}
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::relu(Tensor self) -> Tensor",
+                "aten::relu_(Tensor(a!) self) -> Tensor(a!)",
+                "aten::relu6(Tensor self) -> (Tensor)",
+                "aten::relu6_(Tensor(a!) self) -> Tensor(a!)",
+                "aten::sigmoid(Tensor self) -> Tensor",
+                "aten::sigmoid_(Tensor(a!) self) -> Tensor(a!)",
+                "aten::tanh(Tensor self) -> Tensor",
+                "aten::tanh_(Tensor(a!) self) -> Tensor(a!)",
+                "aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor",
+                "aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor",
+                "aten::hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)",
+                "aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor",
+                "aten::silu(Tensor self) -> Tensor"};
+    }
+
+    /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELOW:
+     * //'leaky_relu_' is reported not to be a member of 'torch::jit::aten' and I don't know why
+     * "aten::leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)",
+     * */
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::relu,
+                torch::jit::aten::relu_,
+                torch::jit::aten::relu6,
+                torch::jit::aten::relu6_,
+                torch::jit::aten::sigmoid,
+                torch::jit::aten::sigmoid_,
+                torch::jit::aten::tanh,
+                torch::jit::aten::tanh_,
+                torch::jit::aten::leaky_relu,
+                torch::jit::aten::hardtanh,
+                torch::jit::aten::hardtanh_,
+                torch::jit::aten::elu,
+                torch::jit::aten::silu};
+    }
+};
+
+class GeluActivationConverter : public GpuConverter {
+public:
+    GeluActivationConverter() {}
+    virtual ~GeluActivationConverter() {}
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+    const std::vector<std::string> schema_string() {
+        // the aten::gelu schema changed in torch-1.12
+        if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 12) {
+            return {"aten::gelu(Tensor self) -> Tensor"};
+        } else {
+            return {"aten::gelu(Tensor self, *, str approximate='none') -> Tensor"};
+        }
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::gelu};
+    }
+};
+
+class PreluActivationConverter : public GpuConverter {
+public:
+    PreluActivationConverter() {}
+    virtual ~PreluActivationConverter() {}
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+    const std::vector<std::string> schema_string() {
+        return {"aten::prelu(Tensor self, Tensor weight) -> Tensor"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::prelu};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/add.cpp b/poros/src/poros/converter/gpu/add.cpp
new file mode 100644
index 0000000000..32e530a495
--- /dev/null
+++ b/poros/src/poros/converter/gpu/add.cpp
@@ -0,0 +1,377 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file add.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/add.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * @brief unify type: according to the type promotion rules of torch,
+ *        when one of self/other is of float type, the other becomes float as well.
+ * @param [in] engine : trt engine
+ * @param [in] node : current node
+ * @param [in] self : self ITensor
+ * @param [in] other : other ITensor
+ * @return
+**/
+static void unify_type(TensorrtEngine* engine,
+                       const torch::jit::Node *node,
+                       nvinfer1::ITensor*& self,
+                       nvinfer1::ITensor*& other) {
+    if (self->getType() == nvinfer1::DataType::kFLOAT &&
+        other->getType() == nvinfer1::DataType::kINT32) {
+        auto id_layer = engine->network()->addIdentity(*other);
+        id_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        id_layer->setName((layer_info(node) + "_IIdentityLayer_other_to_float").c_str());
+        other = id_layer->getOutput(0);
+    }
+
+    if (other->getType() == nvinfer1::DataType::kFLOAT &&
+        self->getType() == nvinfer1::DataType::kINT32) {
+        auto id_layer = engine->network()->addIdentity(*self);
+        id_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        id_layer->setName((layer_info(node) + "_IIdentityLayer_self_to_float").c_str());
+        self = id_layer->getOutput(0);
+    }
+}
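+
+// Worked example (comment only): for aten::add(self, other) where self is
+// kINT32 and other is kFLOAT, unify_type routes self through an IIdentityLayer
+// whose output type is kFLOAT, mirroring torch's promotion rule
+// (int32 + float -> float) before the elementwise kSUM layer is added.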
+
+/*
+"aten::add.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> Tensor",
+"aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor"*/
+bool AddConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+
+    // aten::add.int(int a, int b) -> (int)
+    // aten::add.t(t[] a, t[] b) -> (t[])
+    if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[4]).operator_name() ||
+        node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[5]).operator_name()) {
+        POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for AddConverter");
+        if (check_inputs_tensor_scalar(engine, node)) {
+            // get the nvtensors corresponding to the ints
+            nvinfer1::ITensor* a = this->get_tensor_scalar(inputs[0]);
+            nvinfer1::ITensor* b = this->get_tensor_scalar(inputs[1]);
+            // check for null (get_constant may fail and return null);
+            // return false in that case so the subgraph falls back
+            POROS_CHECK_TRUE((a != nullptr && b != nullptr),
+                node_info(node) + std::string("get nvtensor type int false."));
+            if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[4]).operator_name()) {
+                // add a and b and return the result
+                nvinfer1::ILayer* add_layer = add_elementwise(engine,
+                    nvinfer1::ElementWiseOperation::kSUM,
+                    a, b, layer_info(node) + "_sum");
+                POROS_CHECK(add_layer, "Unable to create add layer from node: " << *node);
+                nvinfer1::ITensor* output = add_layer->getOutput(0);
+                engine->context().set_tensor(node->outputs()[0], output);
+                LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+            } else {
+                std::vector<nvinfer1::ITensor*> inputs_nvtensor;
+                // collect the nvtensors of all the ints into a vector and concatenate them at the end
+                inputs_nvtensor.push_back(a);
+                inputs_nvtensor.push_back(b);
+                nvinfer1::IConcatenationLayer* concat_layer =
+                    engine->network()->addConcatenation(inputs_nvtensor.data(), inputs_nvtensor.size());
+                concat_layer->setAxis(0);
+                concat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+                engine->context().set_tensor(node->outputs()[0], concat_layer->getOutput(0));
+            }
+        } else {
+            torch::jit::IValue a_ivalue = engine->context().get_constant(inputs[0]);
+            if (a_ivalue.isInt()) {
+                int a = engine->context().get_constant(inputs[0]).toScalar().to<int>();
+                int b = engine->context().get_constant(inputs[1]).toScalar().to<int>();
+                engine->context().set_constant(node->outputs()[0], a + b);
+            } else if (a_ivalue.isIntList()) {
+                std::vector<int64_t> a_vec = engine->context().get_constant(inputs[0]).toIntList().vec();
+                std::vector<int64_t> b_vec = engine->context().get_constant(inputs[1]).toIntList().vec();
+                a_vec.insert(a_vec.end(), b_vec.begin(), b_vec.end());
+                auto output_ivalue = c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(a_vec)));
+                engine->context().set_constant(node->outputs()[0], output_ivalue);
+            } else {
+                // a and b are tensorlists
+                if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+                    std::vector<nvinfer1::ITensor*> in_tensor_a, in_tensor_b;
+                    engine->context().get_tensorlist(inputs[0], in_tensor_a);
+                    engine->context().get_tensorlist(inputs[1], in_tensor_b);
+                    in_tensor_a.insert(in_tensor_a.end(), in_tensor_b.begin(), in_tensor_b.end());
+                    engine->context().set_tensorlist(node->outputs()[0], in_tensor_a);
+                } else {
+                    LOG(INFO) << *node->maybeSchema() << " meets unknown input type.";
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for AddConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for AddConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for AddConverter does not come from prim::Constant as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    //extract the scalar
+    //TODO: check the situation where the input scalar is of int type
+    auto scalar_ivalue = (engine->context().get_constant(inputs[2]));
+    // read input[2] as float first
+    auto scalar = scalar_ivalue.toScalar().to<float>();
+    //which one is better?
+    //auto scalar = ((engine->context().get_constant(inputs[2]))->to();
+    //auto scalar = ((engine->context().get_constant(inputs[2])).to()).to();
+
+    //extract other
+    auto other = engine->context().get_tensor(inputs[1]);
+    //situation1: ---------- when the other input is a Scalar -------------
+    if (other == nullptr) {
+        auto other_const = engine->context().get_constant(inputs[1]);
+        if (other_const.isScalar()) {
+            // read input[1] as float first
+            auto other_scalar = other_const.toScalar().to<float>();
+            at::Tensor prod_tensor = torch::tensor({other_scalar * scalar});
+            // if input[1] and input[2] are both ints, the product must be converted back to int
+            if (scalar_ivalue.isInt() && other_const.isInt()) {
+                prod_tensor = prod_tensor.to(at::ScalarType::Int);
+            }
+            other = tensor_to_const(engine, prod_tensor);
+        } else {
+            POROS_THROW_ERROR("Unable to get input other value for AddConverter");
+        }
+    //situation2: ---------- when the other input is a Tensor -------------
+    } else {
+        const double EPSILON = 1e-9;
+        if (fabs(scalar - 1.0) > EPSILON) {
+            nvinfer1::ITensor* alphaTensor = nullptr;
+            // if input[1] and input[2] are ints, input[2] must be converted back to int;
+            // otherwise multiplying float by int yields 0 in trt.
+            if (scalar_ivalue.isInt() && other->getType() == nvinfer1::DataType::kINT32) {
+                alphaTensor = tensor_to_const(engine, torch::tensor({scalar}).to(at::ScalarType::Int));
+            } else {
+                alphaTensor = tensor_to_const(engine, torch::tensor({scalar}));
+            }
+            auto scaleLayer = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kPROD,
+                other,
+                alphaTensor,
+                layer_info(node) + "_prod");
+            POROS_CHECK(scaleLayer, "Unable to create alpha*input layer from node: " << *node);
+            other = scaleLayer->getOutput(0);
+        }
+    }
+
+    unify_type(engine, node, self, other);
+
+    auto add = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kSUM,
+        self,
+        other,
+        layer_info(node) + "_sum");
+    POROS_CHECK(add, "Unable to create add layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], add->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << add->getOutput(0)->getDimensions();
+    return true;
+}
self, Tensor other, *, Scalar alpha=1) -> Tensor", +"aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor",*/ +bool SubConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + // aten::sub.int(int a, int b) -> (int) + if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[4]).operator_name()) { + POROS_CHECK_TRUE((inputs.size() == 2), "invaid inputs size for SubConverter"); + if (check_inputs_tensor_scalar(engine, node)) { + // 获取int对应的nvtensor + nvinfer1::ITensor* a = this->get_tensor_scalar(inputs[0]); + nvinfer1::ITensor* b = this->get_tensor_scalar(inputs[1]); + // 判断是否为空 (get_constant失败时可能为空) + // 为空时返回false, 让子图fallback + POROS_CHECK_TRUE((a != nullptr && b != nullptr), + node_info(node) + std::string("get int nvtensor false.")); + // a和b相加并返回 + nvinfer1::ILayer* sub_layer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUB, + a, b, layer_info(node) + "_sub"); + POROS_CHECK(sub_layer, "Unable to create sub layer from node: " << *node); + nvinfer1::ITensor* output = sub_layer->getOutput(0); + engine->context().set_tensor(node->outputs()[0], output); + LOG(INFO) << "Output tensor shape: " << output->getDimensions(); + } else { + int a = engine->context().get_constant(inputs[0]).toScalar().to(); + int b = engine->context().get_constant(inputs[1]).toScalar().to(); + engine->context().set_constant(node->outputs()[0], a - b); + } + return true; + } + + POROS_CHECK_TRUE((inputs.size() == 3), "invaid inputs size for SubConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for SubConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant), + "input[2] for SubConverter is not come from prim::Constant as expected"); + + //extract self + auto self = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node); + + //extract scalar + //TODO: check input scalar is int type situation + auto scalar_ivalue = (engine->context().get_constant(inputs[2])); + // 先转成float去接input[2]输入 + auto scalar = scalar_ivalue.toScalar().to(); + + //extract other + auto other = engine->context().get_tensor(inputs[1]); + //situation1: ---------- when other input is Scalar ------------- + if (other == nullptr) { + auto other_const = engine->context().get_constant(inputs[1]); + if (other_const.isScalar()) { + // 先转成float去接input[1]输入 + auto other_scalar = other_const.toScalar().to(); + at::Tensor prod_tensor = torch::tensor({other_scalar * scalar}); + // input[1]和input[2]若本身是int,相乘结果需转成int + if (scalar_ivalue.isInt() && other_const.isInt()) { + prod_tensor = prod_tensor.to(at::ScalarType::Int); + } + other = tensor_to_const(engine, prod_tensor); + } else { + POROS_THROW_ERROR("Unable to get input other value for MulConverter"); + } + //situation2: ---------- when other input is Tensor ------------- + } else { + const double EPSILON = 1e-9; + if (fabs(scalar - 1.0) > EPSILON) { + nvinfer1::ITensor* alphaTensor = nullptr; + // input[1]和input[2]若本身是int,则input[2]需转回int。否则trt中float和int相乘为0。 + if (scalar_ivalue.isInt() && other->getType() == nvinfer1::DataType::kINT32) { + alphaTensor = tensor_to_const(engine, torch::tensor({scalar}).to(at::ScalarType::Int)); + } else { + alphaTensor = tensor_to_const(engine, torch::tensor({scalar})); + } + auto scaleLayer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + other, + 
alphaTensor, + layer_info(node) + "_prod"); + POROS_CHECK(scaleLayer, "Unable to create alpha*input layer from node: " << *node); + other = scaleLayer->getOutput(0); + } + } + + unify_type(engine, node, self, other); + + auto sub = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUB, + self, + other, + layer_info(node) + "_sub"); + POROS_CHECK(sub, "Unable to create sub layer from node: " << *node); + engine->context().set_tensor(node->outputs()[0], sub->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << sub->getOutput(0)->getDimensions(); + return true; +} + +// aten::rsub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> (Tensor) +// aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor) +bool RsubConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 3), "invaid inputs size for SubConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for SubConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant), + "input[2] for SubConverter is not come from prim::Constant as expected"); + + //extract self + nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node); + //extract scalar + //TODO: check input scalar is int type situation + auto scalar_ivalue = engine->context().get_constant(inputs[2]); + // 先转成float去接input[2]输入 + auto scalar = scalar_ivalue.toScalar().to(); + + // self * alpha + const double EPSILON = 1e-9; + if (fabs(scalar - 1.0) > EPSILON) { + nvinfer1::ITensor* alpha_tensor = nullptr; + // input[0]和input[2]若本身是int,则input[2]需转回int。否则trt中float和int相乘为0。 + if (scalar_ivalue.isInt() && self->getType() == nvinfer1::DataType::kINT32) { + alpha_tensor = tensor_to_const(engine, torch::tensor({scalar}).to(at::ScalarType::Int)); + } else { + alpha_tensor = tensor_to_const(engine, torch::tensor({scalar})); + } + auto scaleLayer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + self, + alpha_tensor, + layer_info(node) + "_prod"); + POROS_CHECK(scaleLayer, "Unable to create alpha*input layer from node: " << *node); + self = scaleLayer->getOutput(0); + } + + //extract other + auto other = engine->context().get_tensor(inputs[1]); + //situation1: ---------- when other input is Scalar ------------- + if (other == nullptr) { + auto other_const = engine->context().get_constant(inputs[1]); + if (other_const.isScalar()) { + // 先转成float去接input[1]输入 + auto other_scalar = other_const.toScalar().to(); + at::Tensor other_tensor = torch::tensor({other_scalar}); + if (other_const.isInt() && self->getType() == nvinfer1::DataType::kINT32) { + other_tensor = other_tensor.to(at::ScalarType::Int); + } + other = tensor_to_const(engine, other_tensor); + } else { + POROS_THROW_ERROR("Unable to get input other value for MulConverter"); + } + } + + unify_type(engine, node, self, other); + + auto sub = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUB, + other, + self, + layer_info(node) + "_rsub"); + POROS_CHECK(sub, "Unable to create sub layer from node: " << *node); + engine->context().set_tensor(node->outputs()[0], sub->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << sub->getOutput(0)->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, AddConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, SubConverter); 
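All three converters above fold the `alpha` scalar into one operand with an elementwise product before the final sum or difference. A minimal standalone libtorch sanity check of that fold (illustrative only, not part of this diff):

    // verifies: add(self, other, alpha) == self + other * alpha
    #include <torch/torch.h>
    #include <iostream>

    int main() {
        torch::manual_seed(0);
        auto self  = torch::randn({2, 3});
        auto other = torch::randn({2, 3});
        double alpha = 0.5;
        auto folded    = self + other * alpha;            // what the converter emits
        auto reference = torch::add(self, other, alpha);  // the op being converted
        std::cout << torch::allclose(folded, reference) << std::endl;  // prints 1
        return 0;
    }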
diff --git a/poros/src/poros/converter/gpu/add.h b/poros/src/poros/converter/gpu/add.h
new file mode 100644
index 0000000000..c3284c60c1
--- /dev/null
+++ b/poros/src/poros/converter/gpu/add.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file add.h
+* @author tianjinjin@baidu.com
+* @date Mon Aug 16 12:26:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class AddConverter : public GpuConverter {
+public:
+    AddConverter() {}
+    virtual ~AddConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    const std::vector<std::string> schema_string() {
+        return {"aten::add.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> Tensor",
+                "aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor",
+                "aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)",
+                "aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)",
+                "aten::add.int(int a, int b) -> (int)",
+                "aten::add.t(t[] a, t[] b) -> (t[])"
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::add,
+                torch::jit::aten::add_};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::add.int(int a, int b) -> (int)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::add.t(t[] a, t[] b) -> (t[])", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor)", {1, 1}}});
+        return result;
+    }
+};
+
+class SubConverter : public GpuConverter {
+public:
+    SubConverter() {}
+    virtual ~SubConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor",
+                "aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor",
+                "aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)",
+                "aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)",
+                "aten::sub.int(int a, int b) -> (int)",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::sub,
+                torch::jit::aten::sub_};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::sub.int(int a, int b) -> (int)", {1, 1}}});
+    }
+};
+
+class RsubConverter : public GpuConverter {
+public:
+    RsubConverter() {}
+    virtual ~RsubConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::rsub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> (Tensor)",
+                "aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor)",
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::rsub};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/aten_eval.cpp b/poros/src/poros/converter/gpu/aten_eval.cpp
new file mode 100644
index 0000000000..8dd93d9d3f
--- /dev/null
+++ b/poros/src/poros/converter/gpu/aten_eval.cpp
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file aten_eval.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/aten_eval.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::append.t(t[](a!) self, t(c -> *) el) -> (t[](a!))"*/
+bool AppendConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for AppendConverter");
+
+    //extract self
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensor list err");
+    // guard against tensorlist inputs that never pass through the trt engine
+    // (without this, building the trt engine may fail with "Unused Input"
+    // or "Tensor xxx cannot be both input and output" errors)
+    for (size_t i = 0; i < tensorlist.size(); i++) {
+        tensorlist[i] = engine->network()->addIdentity(*tensorlist[i])->getOutput(0);
+    }
+    //extract element
+    auto element = engine->context().get_tensor(inputs[1]);
+    //element is an already converted nvtensor
+    if (element != nullptr) {
+        element = engine->network()->addIdentity(*element)->getOutput(0);
+        tensorlist.emplace_back(element);
+        engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+        engine->context().set_tensorlist(node->inputs()[0], tensorlist);
+        return true;
+    } else {
+        LOG(WARNING) << "appending a non-tensor element is currently not supported in AppendConverter";
+        return false;
+    }
+}
+
+/*
+"aten::__getitem__.t(t[](a) list, int idx) -> (t(*))"*/
+bool GetitemConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for GetitemConverter");
+
+    if (node->outputs()[0]->type()->str() == "Tensor") {
+        //extract list
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensor list err")
+
+        const int64_t list_size = tensorlist.size();
+        auto index = (engine->context().get_constant(inputs[1])).toInt();
+        const int64_t normalized_index = index < 0 ? list_size + index : index;
+        nvinfer1::ITensor* out_tensor = tensorlist[normalized_index];
+        engine->context().set_tensor(node->outputs()[0], out_tensor);
+        return true;
+    } else {
+        // extract an int list held as an nvtensor
+        if (check_inputs_tensor_scalar(engine, node)) {
+            nvinfer1::ITensor* list_itensor = this->get_tensor_scalar(inputs[0]);
+            POROS_CHECK_TRUE((list_itensor != nullptr),
+                node_info(node) + std::string("get int nvtensor false."));
+
+            auto index = (engine->context().get_constant(inputs[1])).toInt();
+            auto list_len = (list_itensor->getDimensions()).d[0];
+            POROS_CHECK_TRUE((index >= -list_len && index <= list_len - 1),
+                node_info(node) + std::string(" idx is out of range."));
+
+            // normalize a negative index to a positive one
+            index = index < 0 ? index + list_len : index;
+
+            nvinfer1::ITensor* index_itensor = tensor_to_const(engine, torch::tensor({index}, torch::kInt));
+
+            //extract the specific dynamic dim as a 1D-1value tensor
+            std::vector<int64_t> start_vec{0}, size_vec{1}, stride_vec{1};
+            nvinfer1::ISliceLayer* slice_layer = engine->network()->addSlice(*list_itensor,
+                                                    sizes_to_nvdim(start_vec),
+                                                    sizes_to_nvdim(size_vec),
+                                                    sizes_to_nvdim(stride_vec));
+            POROS_CHECK(slice_layer, "Unable to create slice layer from node: " << *node);
+            slice_layer->setInput(1, *index_itensor);
+            slice_layer->setName((layer_info(node) + "_ISliceLayer").c_str());
+            nvinfer1::ITensor* slice_out = slice_layer->getOutput(0);
+            engine->context().set_tensor(node->outputs()[0], slice_out);
+            LOG(INFO) << "Output tensor shape: " << slice_out->getDimensions();
+        } else {
+            torch::jit::IValue ts_ivalue = engine->context().get_constant(inputs[0]);
+            POROS_CHECK_TRUE((ts_ivalue.isList()), "Unable to init input tensor for node: " << *node);
+            auto list = ts_ivalue.toListRef();
+            const int64_t list_size = list.size();
+            int64_t index = (engine->context().get_constant(inputs[1])).toInt();
+            const int64_t normalized_index = index < 0 ? list_size + index : index;
+            auto value_item = list[normalized_index];
+            engine->context().set_constant(node->outputs()[0], value_item);
+        }
+        return true;
+    }
+}
+
+/*
+"aten::_set_item.t(t[](a!) l, int idx, t(b -> *) el) -> (t[](a!))"*/
+bool SetitemConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for SetitemConverter");
+
+    size_t idx = engine->context().get_constant(inputs[1]).toInt();
+
+    if (node->outputs()[0]->type()->str() == "Tensor[]") {
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensor list err");
+        POROS_CHECK_TRUE((tensorlist.size() > idx), "Tensorlist index out of range: " << *node);
+        // guard against tensorlist inputs that never pass through the trt engine
+        // (without this, building the trt engine may fail with "Unused Input"
+        // or "Tensor xxx cannot be both input and output" errors)
+        for (size_t i = 0; i < tensorlist.size(); i++) {
+            tensorlist[i] = engine->network()->addIdentity(*tensorlist[i])->getOutput(0);
+        }
+        nvinfer1::ITensor *input_tensor = engine->context().get_tensor(inputs[2]);
+        input_tensor = engine->network()->addIdentity(*input_tensor)->getOutput(0);
+        tensorlist[idx] = input_tensor;
+        engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+        engine->context().set_tensorlist(node->inputs()[0], tensorlist);
+        return true;
+    } else {
+        LOG(WARNING) << "_set_item on a non-tensor element is currently not supported in SetitemConverter";
+        return true;
+    }
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, AppendConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, GetitemConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, SetitemConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
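GetitemConverter normalizes Python-style negative indices before reading from either a tensor list or an int list held as an nvtensor. A tiny standalone sketch of that normalization rule (illustrative; `normalize_index` is not a poros helper):

    #include <cassert>
    #include <cstdint>

    // Python-style index -> [0, list_size), as done in GetitemConverter
    int64_t normalize_index(int64_t index, int64_t list_size) {
        return index < 0 ? list_size + index : index;
    }

    int main() {
        assert(normalize_index(-1, 4) == 3);  // last element
        assert(normalize_index(-4, 4) == 0);  // first element
        assert(normalize_index(2, 4) == 2);   // non-negative indices pass through
        return 0;
    }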
diff --git a/poros/src/poros/converter/gpu/aten_eval.h b/poros/src/poros/converter/gpu/aten_eval.h
new file mode 100644
index 0000000000..fbc34c4791
--- /dev/null
+++ b/poros/src/poros/converter/gpu/aten_eval.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file aten_eval.h
+* @author tianjinjin@baidu.com
+* @date Mon Aug 16 12:26:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class AppendConverter : public GpuConverter {
+public:
+    AppendConverter() {}
+    virtual ~AppendConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::append.t(t[](a!) self, t(c -> *) el) -> (t[](a!))
+    const std::vector<std::string> schema_string() {
+        return {"aten::append.t(t[](a!) self, t(c -> *) el) -> (t[](a!))" };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::append};
+    }
+};
+
+class GetitemConverter : public GpuConverter {
+public:
+    GetitemConverter() {}
+    virtual ~GetitemConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::__getitem__.t(t[](a) list, int idx) -> (t(*))
+    const std::vector<std::string> schema_string() {
+        return {"aten::__getitem__.t(t[](a) list, int idx) -> (t(*))"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::__getitem__};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::__getitem__.t(t[](a) list, int idx) -> (t(*))", {1, 1}}});
+    }
+};
+
+class SetitemConverter : public GpuConverter {
+public:
+    SetitemConverter() {}
+    virtual ~SetitemConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::_set_item.t(t[](a!) l, int idx, t(b -> *) el) -> (t[](a!))
+    const std::vector<std::string> schema_string() {
+        return {"aten::_set_item.t(t[](a!) l, int idx, t(b -> *) el) -> (t[](a!))"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::_set_item};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/aten_trt_util.cpp b/poros/src/poros/converter/gpu/aten_trt_util.cpp
new file mode 100644
index 0000000000..aaea55019a
--- /dev/null
+++ b/poros/src/poros/converter/gpu/aten_trt_util.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file aten_trt_util.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Aug 6 14:17:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/aten_trt_util.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//convert an at::Tensor from torchscript into a tensorrt Weights structure
+bool at_tensor_to_trt_weignts(at::Tensor tensor, nvinfer1::Weights& weight) {
+    POROS_CHECK_TRUE((tensor.sizes().size() <= nvinfer1::Dims::MAX_DIMS),
+        "given tensor is out of max_dims");
+
+    /*
+    auto shape = sizes_to_nvdim(tensor.sizes());
+    //TODO: CHECK this bias info.
+    int64_t inputs_num = (tensor.sizes().size() > 1) ? tensor.sizes()[1] : tensor.sizes()[0];
+    int64_t outputs_num = tensor.sizes()[0];
+
+    nvinfer1::Dims kernel_shape;
+    if (tensor.sizes().size() > 2) {
+        kernel_shape.nbDims = tensor.sizes().size() - 2;
+        for (size_t i = 2; i < tensor.sizes().size(); i++) {
+            kernel_shape.d[i - 2] = tensor.sizes()[i];
+        }
+    } else {
+        kernel_shape.nbDims = 1;
+        kernel_shape.d[0] = 1;
+    }*/
+
+    auto t_cpu = tensor.to(at::kCPU);
+    t_cpu = t_cpu.contiguous();
+
+    auto t_type = c10::optTypeMetaToScalarType(t_cpu.dtype());
+    POROS_CHECK_TRUE(t_type.has_value(), "unsupported datatype");
+    //TODO: may fail here
+    auto dtype = attype_to_nvtype(t_type.value());
+
+    void* buf = nullptr;
+    if (dtype == nvinfer1::DataType::kFLOAT) {
+        buf = malloc(t_cpu.numel() * sizeof(float));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(float));
+    } else if (dtype == nvinfer1::DataType::kHALF) {
+        buf = malloc(t_cpu.numel() * (sizeof(float) / 2));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * (sizeof(float) / 2));
+    } else if (dtype == nvinfer1::DataType::kINT8) {
+        buf = malloc(t_cpu.numel() * sizeof(char));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(char));
+    } else if (dtype == nvinfer1::DataType::kINT32) {
+        buf = malloc(t_cpu.numel() * sizeof(int));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(int));
+    } else if (dtype == nvinfer1::DataType::kBOOL) {
+        buf = malloc(t_cpu.numel() * sizeof(bool));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(bool));
+    }
+
+    weight.type = dtype;
+    weight.count = t_cpu.numel();
+    weight.values = buf;
+
+    return true;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
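The helper above mallocs a buffer that the returned `nvinfer1::Weights` points at, so the caller owns that memory for the lifetime of the network build. A float-only standalone sketch of the same pattern (illustrative; `float_tensor_to_weights` is not a poros API):

    #include <cstdlib>
    #include <cstring>
    #include "torch/script.h"
    #include "NvInfer.h"

    // float-only version of the copy above; the malloc'ed buffer must outlive
    // the network build and be freed by the caller
    nvinfer1::Weights float_tensor_to_weights(const at::Tensor& tensor) {
        at::Tensor t_cpu = tensor.to(at::kCPU, at::kFloat).contiguous();
        void* buf = malloc(t_cpu.numel() * sizeof(float));
        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(float));
        return nvinfer1::Weights{nvinfer1::DataType::kFLOAT, buf,
                                 static_cast<int64_t>(t_cpu.numel())};
    }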
diff --git a/poros/src/poros/converter/gpu/aten_trt_util.h b/poros/src/poros/converter/gpu/aten_trt_util.h
new file mode 100644
index 0000000000..34d4ddb7e7
--- /dev/null
+++ b/poros/src/poros/converter/gpu/aten_trt_util.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file aten_trt_util.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 6 10:42:39 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+#include "torch/script.h"
+#include "NvInfer.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//convert an at::Tensor from torchscript into a tensorrt Weights structure
+bool at_tensor_to_trt_weignts(at::Tensor tensor, nvinfer1::Weights& weight);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/converter/gpu/batch_norm.cpp b/poros/src/poros/converter/gpu/batch_norm.cpp
new file mode 100644
index 0000000000..b39e11dac2
--- /dev/null
+++ b/poros/src/poros/converter/gpu/batch_norm.cpp
@@ -0,0 +1,248 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file batch_norm.cpp
+* @author tianjinjin@baidu.com
+* @date Sun Aug 15 22:23:03 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/batch_norm.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+aten::batch_norm(Tensor input,
+Tensor? weight,
+Tensor? bias,
+Tensor? running_mean,
+Tensor? running_var,
+bool training,
+float momentum,
+float eps,
+bool cudnn_enabled) -> Tensor
+*/
+bool BatchNormConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+
+    POROS_CHECK_TRUE((inputs.size() == 9), "invalid inputs size for BatchNormConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for BatchNormConverter is not Tensor as expected");
+    // weight & bias & running_mean & running_var
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for BatchNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for BatchNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant),
+        "input[3] for BatchNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[4]->node()->kind() == torch::jit::prim::Constant),
+        "input[4] for BatchNormConverter does not come from prim::Constant as expected");
+
+    auto input = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((input != nullptr),
+        "Unable to init input tensor for node: " << *node);
+    auto orig_shape = input->getDimensions();
+    auto shape = nvdim_to_sizes(orig_shape);
+    auto tensor_type = nvtype_to_attype(input->getType());
+    auto options = torch::TensorOptions().dtype(tensor_type);
+
+    torch::Tensor gamma, beta, mean, var;
+    auto maybe_gamma = engine->context().get_constant(inputs[1]);
+    auto maybe_beta = engine->context().get_constant(inputs[2]);
+    auto maybe_mean = engine->context().get_constant(inputs[3]);
+    auto maybe_var = engine->context().get_constant(inputs[4]);
+
+    if (maybe_gamma.isTensor()) {
+        gamma = maybe_gamma.toTensor();
+    } else {
+        gamma = at::full({shape}, 1, {options});
+    }
+
+    if (maybe_beta.isTensor()) {
+        beta = maybe_beta.toTensor();
+    } else {
+        beta = at::full({shape}, 1, {options});
+    }
+
+    if (maybe_mean.isTensor()) {
+        mean = maybe_mean.toTensor();
+    } else {
+        mean = at::full({shape}, 0, {options});
+    }
+
+    if (maybe_var.isTensor()) {
+        var = maybe_var.toTensor();
+    } else {
+        var = at::full({shape}, 0, {options});
+    }
+
+    auto eps = static_cast<float>(engine->context().get_constant(inputs[7]).toDouble());
+
+    // Expand spatial dims from 1D to 2D if needed
+    bool expandDims = (orig_shape.nbDims < 4);
+    if (expandDims) {
+        input = add_padding(engine, node, input, 4);
+    }
+
+    auto scale = gamma / torch::sqrt(var + eps);
+    auto bias = beta - mean * scale;
+
+    auto scale_weights = Weights(scale);
+    auto bias_weights = Weights(bias);
+
+    auto power = Weights(at::ones_like(scale));
+    auto bn = engine->network()->addScaleNd(
+        *input, nvinfer1::ScaleMode::kCHANNEL, bias_weights.data, scale_weights.data, power.data, 1);
+    bn->setName((layer_info(node) + "_IScaleLayer").c_str());
+    // Un-pad bn output if needed
+    auto out_tensor = add_unpadding(engine, node, bn->getOutput(0), orig_shape.nbDims);
+    engine->context().set_tensor(node->outputs()[0], out_tensor);
+    LOG(INFO) << "Output tensor shape: " << out_tensor->getDimensions();
+    return true;
+}
+
+/*
+aten::instance_norm(Tensor input,
+Tensor? weight,
+Tensor? bias,
+Tensor? running_mean,
+Tensor? running_var,
+bool use_input_stats,
+float momentum,
+float eps,
+bool cudnn_enabled) -> Tensor
+*/
+bool InstanceNormConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+
+    POROS_CHECK_TRUE((inputs.size() == 9), "invalid inputs size for InstanceNormConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for InstanceNormConverter is not Tensor as expected");
+    // weight & bias & running_mean & running_var
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for InstanceNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for InstanceNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant),
+        "input[3] for InstanceNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[4]->node()->kind() == torch::jit::prim::Constant),
+        "input[4] for InstanceNormConverter does not come from prim::Constant as expected");
+
+    auto input = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((input != nullptr),
+        "Unable to init input tensor for node: " << *node);
+    auto orig_shape = input->getDimensions();
+    auto shape = nvdim_to_sizes(orig_shape);
+    auto tensor_type = nvtype_to_attype(input->getType());
+    auto options = torch::TensorOptions().dtype(tensor_type);
+
+    // Expand spatial dims from 1D to 2D if needed
+    bool expand_dims = (orig_shape.nbDims < 4);
+    if (expand_dims) {
+        input = add_padding(engine, node, input, 4);
+    }
+
+    torch::Tensor weight, bias, mean, var;
+    auto maybe_weight = engine->context().get_constant(inputs[1]);
+    auto maybe_bias = engine->context().get_constant(inputs[2]);
+    auto maybe_mean = engine->context().get_constant(inputs[3]);
+    auto maybe_var = engine->context().get_constant(inputs[4]);
+
+    if (maybe_weight.isTensor()) {
+        weight = maybe_weight.toTensor().cpu().contiguous();
+    } else {
+        weight = at::ones(shape[1], options).cpu().contiguous();
+    }
+
+    if (maybe_bias.isTensor()) {
+        bias = maybe_bias.toTensor().cpu().contiguous();
+    } else {
+        bias = at::zeros(shape[1], options).cpu().contiguous();
+    }
+
+    auto eps = static_cast<float>(engine->context().get_constant(inputs[7]).toDouble());
+
+    //TODO: confirm whether "or" or "and" is the right condition here
+    if (maybe_mean.isTensor() && maybe_var.isTensor()) {
+        mean = maybe_mean.toTensor();
+        var = maybe_var.toTensor();
+
+        auto scale = weight.to(mean.options()) / torch::sqrt(var + eps);
+        auto new_bias = bias.to(mean.options()) - mean * scale;
+
+        auto scale_weights = Weights(scale);
+        auto bias_weights = Weights(new_bias);
+
+        auto power = Weights(at::ones_like(scale));
+        auto bn = engine->network()->addScaleNd(
+            *input, nvinfer1::ScaleMode::kCHANNEL, bias_weights.data, scale_weights.data, power.data, 1);
+        bn->setName((layer_info(node) + "_IScaleLayer").c_str());
+        // Un-pad bn output if needed
+        auto out_tensor = add_unpadding(engine, node, bn->getOutput(0), orig_shape.nbDims);
+        engine->context().set_tensor(node->outputs()[0], out_tensor);
+        LOG(INFO) << "Output tensor shape: " << out_tensor->getDimensions();
+        return true;
+    }
+
+    // https://github.com/NVIDIA/TensorRT/tree/release/8.4/plugin/instanceNormalizationPlugin
+
+    const int relu = 0;
+    const float alpha = 0;
+    std::vector<nvinfer1::PluginField> f;
+    f.emplace_back(nvinfer1::PluginField("epsilon", &eps, nvinfer1::PluginFieldType::kFLOAT32, 1));
+    f.emplace_back(nvinfer1::PluginField(
+        "scales", weight.data_ptr(), nvinfer1::PluginFieldType::kFLOAT32, weight.numel()));
+    f.emplace_back(nvinfer1::PluginField(
+        "bias", bias.data_ptr(), nvinfer1::PluginFieldType::kFLOAT32, bias.numel()));
+    f.emplace_back(nvinfer1::PluginField("relu", &relu, nvinfer1::PluginFieldType::kINT32, 1));
+    f.emplace_back(nvinfer1::PluginField("alpha", &alpha, nvinfer1::PluginFieldType::kFLOAT32, 1));
+
+    nvinfer1::PluginFieldCollection fc;
+    fc.nbFields = f.size();
+    fc.fields = f.data();
+
+    auto creator = getPluginRegistry()->getPluginCreator("InstanceNormalization_TRT", "1", "");
+    auto instance_norm_plugin = creator->createPlugin("instance_norm", &fc);
+
+    POROS_CHECK(instance_norm_plugin, "Unable to create instance_norm plugin from TensorRT plugin registry" << *node);
+    auto new_layer = engine->network()->addPluginV2(
+        reinterpret_cast<nvinfer1::ITensor* const*>(&input), 1, *instance_norm_plugin);
+    new_layer->setName((layer_info(node) + "_plugin_instance_norm").c_str());
+    nvinfer1::ITensor* output = new_layer->getOutput(0);
+
+    if (expand_dims) {
+        output = add_unpadding(engine, node, output, orig_shape.nbDims);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, BatchNormConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, InstanceNormConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
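Both converters above rely on folding the normalization into a single per-channel affine transform, scale = gamma / sqrt(var + eps) and bias = beta - mean * scale, which the IScaleLayer then applies. A standalone libtorch check of that identity (illustrative, not part of this diff):

    #include <torch/torch.h>
    #include <iostream>

    int main() {
        torch::manual_seed(0);
        auto x     = torch::randn({2, 4, 8, 8});
        auto gamma = torch::randn({4});
        auto beta  = torch::randn({4});
        auto mean  = torch::randn({4});
        auto var   = torch::rand({4}) + 0.5;
        double eps = 1e-5;

        // the fold used by the converters
        auto scale  = gamma / torch::sqrt(var + eps);
        auto bias   = beta - mean * scale;
        auto folded = x * scale.view({1, 4, 1, 1}) + bias.view({1, 4, 1, 1});

        // eval-mode batch_norm as the reference
        auto reference = torch::batch_norm(x, gamma, beta, mean, var,
                                           /*training=*/false, /*momentum=*/0.1,
                                           eps, /*cudnn_enabled=*/false);
        std::cout << torch::allclose(folded, reference, 1e-4, 1e-6) << std::endl;  // prints 1
        return 0;
    }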
diff --git a/poros/src/poros/converter/gpu/batch_norm.h b/poros/src/poros/converter/gpu/batch_norm.h
new file mode 100644
index 0000000000..dba8bfeea4
--- /dev/null
+++ b/poros/src/poros/converter/gpu/batch_norm.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file batch_norm.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 13 16:51:50 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class BatchNormConverter : public GpuConverter {
+public:
+    BatchNormConverter() {}
+    virtual ~BatchNormConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    virtual const std::vector<std::string> schema_string() {
+        return {"aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor"};
+    }
+
+    virtual const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::batch_norm,};
+    }
+};
+
+class InstanceNormConverter : public GpuConverter {
+public:
+    InstanceNormConverter() {}
+    virtual ~InstanceNormConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    virtual const std::vector<std::string> schema_string() {
+        return {"aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor"};
+    }
+
+    virtual const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::instance_norm,};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
"_ISliceLayer").c_str()); + nvinfer1::ITensor* output = slice_layer->getOutput(0); + + engine->context().set_tensor(node->outputs()[0], self); + LOG(INFO) << "Output tensor shape: " << output->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, CloneConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/converter/gpu/clone.h b/poros/src/poros/converter/gpu/clone.h new file mode 100644 index 0000000000..92f6617c43 --- /dev/null +++ b/poros/src/poros/converter/gpu/clone.h @@ -0,0 +1,55 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file clone.h +* @author tianshaoqing@baidu.com +* @date Tue Nov 23 12:26:28 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class CloneConverter : public GpuConverter { +public: + CloneConverter() {} + virtual ~CloneConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + // aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + const std::vector schema_string() { + return {"aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::clone}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/coercion.cpp b/poros/src/poros/converter/gpu/coercion.cpp new file mode 100644 index 0000000000..afe442d943 --- /dev/null +++ b/poros/src/poros/converter/gpu/coercion.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/poros/src/poros/converter/gpu/coercion.cpp b/poros/src/poros/converter/gpu/coercion.cpp
new file mode 100644
index 0000000000..afe442d943
--- /dev/null
+++ b/poros/src/poros/converter/gpu/coercion.cpp
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file coercion.cpp
+* @author wangrui39@baidu.com
+* @date Fri May 13 11:36:11 CST 2022
+* @brief
+**/
+
+#include "poros/converter/gpu/coercion.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*"aten::Int.float(float a) -> (int)"
+"aten::Int.Tensor(Tensor a) -> (int)"*/
+bool CoercionConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for CoercionConverter");
+    nvinfer1::ITensor *tensor_a = engine->context().get_tensor(inputs[0]);
+
+    // tensor to int: cast the nvtensor to int32 through an identity layer
+    if (nullptr != tensor_a) {
+        auto id_layer = engine->network()->addIdentity(*tensor_a);
+        id_layer->setName((layer_info(node) + "_IIdentityLayer").c_str());
+        id_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+        engine->context().set_tensor(node->outputs()[0], id_layer->getOutput(0));
+    } else {
+        int a = engine->context().get_constant(inputs[0]).toScalar().to<int>();
+        engine->context().set_constant(node->outputs()[0], a);
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, CoercionConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
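The tensor branch above casts by running the value through an IIdentityLayer whose output type is forced to kINT32. A minimal raw-TensorRT sketch of that cast-by-identity trick (assumes the TensorRT 8.x API; engine build and cleanup trimmed):

    #include "NvInfer.h"
    #include <iostream>

    namespace {
    class StdoutLogger : public nvinfer1::ILogger {
        void log(Severity severity, const char* msg) noexcept override {
            if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
        }
    };
    } // namespace

    int main() {
        StdoutLogger logger;
        auto builder = nvinfer1::createInferBuilder(logger);
        auto network = builder->createNetworkV2(
            1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
        auto x = network->addInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims{1, {4}});
        auto id_layer = network->addIdentity(*x);
        id_layer->setOutputType(0, nvinfer1::DataType::kINT32);  // float -> int32 cast
        network->markOutput(*id_layer->getOutput(0));
        // ... build and run the engine as usual
        return 0;
    }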
diff --git a/poros/src/poros/converter/gpu/coercion.h b/poros/src/poros/converter/gpu/coercion.h
new file mode 100644
index 0000000000..5eaddf45f8
--- /dev/null
+++ b/poros/src/poros/converter/gpu/coercion.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file coercion.h
+* @author wangrui39@baidu.com
+* @date Fri May 13 11:36:11 CST 2022
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class CoercionConverter : public GpuConverter {
+public:
+    CoercionConverter() {}
+    virtual ~CoercionConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::Int.float(float a) -> (int)",
+                "aten::Int.Tensor(Tensor a) -> (int)"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::Int};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::Int.float(float a) -> (int)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::Int.Tensor(Tensor a) -> (int)", {1, 1}}});
+        return result;
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/concat.cpp b/poros/src/poros/converter/gpu/concat.cpp
new file mode 100644
index 0000000000..f25b7236e3
--- /dev/null
+++ b/poros/src/poros/converter/gpu/concat.cpp
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file concat.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/concat.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*"aten::cat(Tensor[] tensors, int dim=0) -> Tensor"*/
+bool ConcatConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for ConcatConverter");
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors()),
+        "input[0] for ConcatConverter is not TensorList as expected");
+    POROS_CHECK_TRUE(inputs[1]->type()->isSubtypeOf(c10::NumberType::get()),
+        "input[1] for ConcatConverter is not int64_t as expected");
+
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensor list err")
+
+    //extract dim
+    auto dim = (engine->context().get_constant(inputs[1])).toInt();
+    if (dim < 0) {
+        dim = tensorlist[0]->getDimensions().nbDims + dim;
+    }
+
+    auto cat_layer = engine->network()->addConcatenation(tensorlist.data(), tensorlist.size());
+    cat_layer->setAxis(static_cast<int>(dim));
+    cat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+    engine->context().set_tensor(node->outputs()[0], cat_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << cat_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ConcatConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
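The converter normalizes a negative `dim` by adding the tensor rank before calling setAxis, matching PyTorch's indexing convention. A one-line standalone libtorch check (illustrative):

    #include <torch/torch.h>
    #include <iostream>

    int main() {
        auto a = torch::randn({2, 3});
        auto b = torch::randn({2, 3});
        // dim = -1 normalizes to nbDims + dim = 1, exactly what the converter feeds setAxis
        std::cout << torch::equal(torch::cat({a, b}, -1), torch::cat({a, b}, 1)) << std::endl;  // prints 1
        return 0;
    }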
diff --git a/poros/src/poros/converter/gpu/concat.h b/poros/src/poros/converter/gpu/concat.h
new file mode 100644
index 0000000000..147d7603ab
--- /dev/null
+++ b/poros/src/poros/converter/gpu/concat.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file concat.h
+* @author tianjinjin@baidu.com
+* @date Tue Jul 27 11:24:21 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//TODO: there is a concat_opt.cpp in torchscript. check it.
+class ConcatConverter : public GpuConverter {
+public:
+    ConcatConverter() {}
+    virtual ~ConcatConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::cat(Tensor[] tensors, int dim=0) -> Tensor"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::cat};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::cat(Tensor[] tensors, int dim=0) -> Tensor", {1, 1}}});
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/constant.cpp b/poros/src/poros/converter/gpu/constant.cpp
new file mode 100644
index 0000000000..f0735adfdd
--- /dev/null
+++ b/poros/src/poros/converter/gpu/constant.cpp
@@ -0,0 +1,80 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+#include "torch/script.h"
+
+#include "poros/converter/gpu/constant.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool ConstantConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    c10::optional<torch::jit::IValue> ivalue = toIValue(node->output());
+    POROS_CHECK_TRUE(ivalue.has_value(), "invalid data for ConstantConverter");
+    engine->context().set_constant(node->outputs()[0], ivalue.value());
+
+    //situation1: Tensor
+    if (ivalue.value().isTensor()) {
+        auto tensor = ivalue.value().toTensor();
+        auto t_weights = Weights(tensor);
+        auto const_layer = engine->network()->addConstant(t_weights.shape, t_weights.data);
+        const_layer->setName(layer_info(node).c_str());
+        engine->context().set_tensor(node->outputs()[0], const_layer->getOutput(0));
+    }
+    //situation2: Tensor[]
+    else if (ivalue.value().isTensorList()) {
+        auto c10_tensorlist = ivalue.value().toTensorList();
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        tensorlist.reserve(c10_tensorlist.size());
+        for (size_t i = 0; i < c10_tensorlist.size(); i++){
+            nvinfer1::ITensor* nv_tensor = tensor_to_const(engine, c10_tensorlist[i]);
+            tensorlist.emplace_back(nv_tensor);
+        }
+        engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+    }
+    //situation3: Tensor?[]
+    else if (ivalue.value().type()->str().find("Tensor?[]") != std::string::npos) {
+        c10::List<c10::IValue> c10_tensorlist = ivalue.value().toList();
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        tensorlist.reserve(c10_tensorlist.size());
+        for (size_t i = 0; i < c10_tensorlist.size(); i++){
+            auto tensor = c10_tensorlist.get(i).toTensor();
+            nvinfer1::ITensor* nv_tensor = tensor_to_const(engine, tensor);
+            tensorlist.emplace_back(nv_tensor);
+        }
+        engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+    }
+
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ConstantConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/constant.h b/poros/src/poros/converter/gpu/constant.h
new file mode 100644
index 0000000000..d17218cdbb
--- /dev/null
+++ b/poros/src/poros/converter/gpu/constant.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant.h
+* @author tianjinjin@baidu.com
+* @date Tue Jul 27 11:24:21 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ConstantConverter : public GpuConverter {
+public:
+    ConstantConverter() {}
+    virtual ~ConstantConverter() {}
+
+    /**
+     * @brief converter: turn the node into layers and add them to the engine's network definition
+     * @retval true => success, false => fail
+     **/
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //prim::Constant kind node has no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::Constant};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
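ConstantConverter reads the constant back out of the node with toIValue and then dispatches on its runtime type. A small standalone sketch of that first step on a parsed graph (assumes torch::jit::parseIR and toIValue from the TorchScript IR headers):

    #include "torch/csrc/jit/ir/irparser.h"
    #include "torch/csrc/jit/ir/constants.h"
    #include <iostream>
    #include <memory>

    int main() {
        auto graph = std::make_shared<torch::jit::Graph>();
        torch::jit::parseIR(R"IR(
          graph():
            %0 : int = prim::Constant[value=42]()
            return (%0)
        )IR", graph.get());

        // same extraction the converter performs on each prim::Constant node
        const torch::jit::Node* node = graph->outputs()[0]->node();
        c10::optional<torch::jit::IValue> ivalue = torch::jit::toIValue(node->output());
        std::cout << ivalue->toInt() << std::endl;  // prints 42
        return 0;
    }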
diff --git a/poros/src/poros/converter/gpu/constant.h b/poros/src/poros/converter/gpu/constant.h
new file mode 100644
index 0000000000..d17218cdbb
--- /dev/null
+++ b/poros/src/poros/converter/gpu/constant.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant.h
+* @author tianjinjin@baidu.com
+* @date Tue Jul 27 11:24:21 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ConstantConverter : public GpuConverter {
+public:
+    ConstantConverter() {}
+    virtual ~ConstantConverter() {}
+
+    /**
+     * @brief converter: turn the node into TensorRT layers and add them to the engine's network definition
+     * @retval true => success, false => fail
+     **/
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //prim::Constant kind node has no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::Constant};
+    }
+
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/constant_pad_nd.cpp b/poros/src/poros/converter/gpu/constant_pad_nd.cpp
new file mode 100644
index 0000000000..f1f5527acd
--- /dev/null
+++ b/poros/src/poros/converter/gpu/constant_pad_nd.cpp
@@ -0,0 +1,375 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant_pad_nd.cpp
+* @author tianshaoqing@baidu.com
+* @date Thu Dec 2 14:29:20 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/constant_pad_nd.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//DEPRECATED: this implementation is built on concat internally; during TensorRT's profiling
+//phase it introduces extra copy nodes, which hurts performance.
+// aten::constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor
+bool ConstantPadNdConverter::converter_old_version(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for ConstantPadNdConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), 
+        "input[0] for ConstantPadNdConverter is not Tensor as expected");
+
+    // extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto self_dims = self->getDimensions();
+    int64_t self_rank = self_dims.nbDims;
+
+    // check input is dynamic or not
+    std::vector<int64_t> self_dims_vec = nvdim_to_sizes(self_dims);
+    // extract pad
+    torch::jit::IValue maybe_pad = engine->context().get_constant(inputs[1]);
+    POROS_CHECK_TRUE((!maybe_pad.isNone()), "invalid inputs[1] for ConstantPadNdConverter");
+    std::vector<int64_t> padding = maybe_pad.toIntList().vec();
+    int64_t pad_size = padding.size();
+
+    // pad_size must be an integer multiple of 2
+    POROS_CHECK_TRUE((pad_size % 2 == 0), "Length of pad must be even but instead it equals: " << pad_size);
+    int64_t l_pad = pad_size / 2;
+    POROS_CHECK_TRUE((self_rank >= l_pad), "Length of pad should be no more than twice the number of "
+        "dimensions of the input. Pad length is " << pad_size << " while the input has " << self_rank << " dimensions.");
+
+    // extract value
+    torch::jit::IValue maybe_value = engine->context().get_constant(inputs[2]);
+    POROS_CHECK_TRUE((!maybe_value.isNone()), "invalid inputs[2] for ConstantPadNdConverter");
+    float value = maybe_value.toScalar().to<float>();
+
+    // prepare for dynamic
+    const bool is_dynamic = check_nvtensor_is_dynamic(self);
+
+    // TensorRT's IFillLayer cannot produce bool outputs when shapes are dynamic, so return
+    // false for now (the bool constant_pad_nd case is rare anyway)
+    if (is_dynamic && maybe_value.isBool()) {
+        LOG(WARNING) << "ConstantPadNdConverter does not support a padding value of type bool when dynamic.";
+        return false;
+    }
+
+    nvinfer1::ITensor* self_shape = nullptr;
+    nvinfer1::ITensor* rev_mask_shape_tensor = nullptr;
+    if (is_dynamic) {
+        self_shape = engine->network()->addShape(*self)->getOutput(0);
+    }
+
+    // create itensors vector
+    std::vector<nvinfer1::ITensor*> itensors_vec;
+
+    for (int64_t i = 0; i < l_pad; i++) {
+        int64_t axis = self_rank - (i + 1);
+        int64_t padding_index = i * 2;
+        // dynamic case: the padding shape has to be built with masks.
+        // First, use rev_mask_tensor to zero out self_shape[axis].
+        // e.g. self_shape = [2, 3, 4, 5] and axis = 3 give rev_mask_shape_tensor = [2, 3, 4, 0]
+        if (is_dynamic && (padding[padding_index] > 0 || padding[padding_index + 1] > 0)) {
+            at::Tensor rev_mask_tensor = at::ones({self_rank}, torch::kInt);
+            rev_mask_tensor[axis] = 0;
+            nvinfer1::ITensor* nv_rev_mask_tensor = tensor_to_const(engine, rev_mask_tensor);
+            rev_mask_shape_tensor = add_elementwise(engine, 
+                        nvinfer1::ElementWiseOperation::kPROD, 
+                        self_shape, 
+                        nv_rev_mask_tensor, 
+                        layer_info(node) + "_prod(axis_dim_to_zero)_" + std::to_string(i))->getOutput(0);
+        }
+
+        if (padding[padding_index] > 0) {
+            itensors_vec.clear();
+            // static (non-dynamic) case
+            if (!is_dynamic) {
+                // create pad tensor
+                self_dims_vec[axis] = padding[padding_index];
+                at::Tensor pad_tensor = at::full(self_dims_vec, value, torch::kFloat32);
+                // defaults to float32; convert the type if self is int32
+                if (self->getType() == nvinfer1::DataType::kINT32) {
+                    pad_tensor = pad_tensor.to(at::ScalarType::Int);
+                }
+                // defaults to float32; convert the type if self is bool (the bool case is rare)
+                if (self->getType() == nvinfer1::DataType::kBOOL && maybe_value.isBool()) {
+                    pad_tensor = pad_tensor.to(at::ScalarType::Bool);
+                }
+                itensors_vec.push_back(tensor_to_const(engine, pad_tensor));
+            } else {
+                // dynamic case
+                // Next, use mask_tensor to build a tensor whose value is padding[padding_index]
+                // at index axis and 0 everywhere else.
+                // e.g. self_shape = [2, 3, 4, 5] gives self_rank = 4; with axis = 3 and
+                // padding[padding_index] = 2, the constructed nv_mask_tensor = [0, 0, 0, 2]
+                at::Tensor mask_tensor = at::zeros({self_rank}, torch::kInt);
+                mask_tensor[axis] = padding[padding_index];
+                nvinfer1::ITensor* nv_mask_tensor = tensor_to_const(engine, mask_tensor);
+                // Finally, add nv_mask_tensor to the rev_mask_shape_tensor obtained above to get
+                // the padding shape.
+                // e.g. rev_mask_shape_tensor = [2, 3, 4, 0] and nv_mask_tensor = [0, 0, 0, 2]
+                // give pad_shape_tensor = [2, 3, 4, 2]
+                nvinfer1::ITensor* pad_shape_tensor = add_elementwise(engine, 
+                            nvinfer1::ElementWiseOperation::kSUM, 
+                            rev_mask_shape_tensor, 
+                            nv_mask_tensor, 
+                            layer_info(node) + "_sum(gen_left_pad_shape)_" + std::to_string(i))->getOutput(0);
+                // create the pad tensor from the padding shape and value
+                auto fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+                fill_layer->setInput(0, *pad_shape_tensor);
+                fill_layer->setName((layer_info(node) + "_IFillLayer_" + std::to_string(padding_index)).c_str());
+
+                at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+                at::Tensor delta_tensor = torch::zeros(self_rank, torch::kFloat32);
+                // defaults to float32; convert the type if self is int32
+                if (self->getType() == nvinfer1::DataType::kINT32) {
+                    value_tensor = value_tensor.to(at::ScalarType::Int);
+                    delta_tensor = delta_tensor.to(at::ScalarType::Int);
+                }
+                auto value_itensor = tensor_to_const(engine, value_tensor);
+                fill_layer->setInput(1, *value_itensor);  // start value
+                auto delta_itensor = tensor_to_const(engine, delta_tensor);
+                fill_layer->setInput(2, *delta_itensor);  // delta value
+
+                itensors_vec.push_back(fill_layer->getOutput(0));
+            }
+
+            itensors_vec.push_back(self);
+            // concat
+            nvinfer1::IConcatenationLayer* concat_layer = 
+                engine->network()->addConcatenation(itensors_vec.data(), itensors_vec.size());
+            concat_layer->setAxis(axis);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer_" + std::to_string(padding_index)).c_str());
+            self = concat_layer->getOutput(0);
+            // static case: refresh the cached dims
+            self_dims = self->getDimensions();
+            self_dims_vec = nvdim_to_sizes(self_dims);
+            // dynamic case: refresh the shape tensor
+            if (is_dynamic) {
+                self_shape = engine->network()->addShape(*self)->getOutput(0);
+            }
+        }
+
+        if (padding[padding_index + 1] > 0) {
+            itensors_vec.clear();
+            // pad the other side of dim=axis; this differs from the block above only in the
+            // order in which self is pushed into itensors_vec (self goes first here)
+            itensors_vec.push_back(self);
+
+            // create pad tensor
+            if (!is_dynamic) {
+                self_dims_vec[axis] = padding[padding_index + 1];
+                at::Tensor pad_tensor = at::full(self_dims_vec, value, torch::kFloat32);
+                // defaults to float32; convert the type if self is int32
+                if (self->getType() == nvinfer1::DataType::kINT32) {
+                    pad_tensor = pad_tensor.to(at::ScalarType::Int);
+                }
+                // defaults to float32; convert the type if self is bool (the bool case is rare)
+                if (self->getType() == nvinfer1::DataType::kBOOL && maybe_value.isBool()) {
+                    pad_tensor = pad_tensor.to(at::ScalarType::Bool);
+                }
+                itensors_vec.push_back(tensor_to_const(engine, pad_tensor));
+            } else {
+                // similar to the dynamic branch above
+                at::Tensor mask_tensor = at::zeros({self_rank}, torch::kInt);
+                mask_tensor[axis] = padding[padding_index + 1];
+                nvinfer1::ITensor* nv_mask_tensor = tensor_to_const(engine, mask_tensor);
+                nvinfer1::ITensor* pad_shape_tensor = add_elementwise(engine, 
+                            nvinfer1::ElementWiseOperation::kSUM, 
+                            rev_mask_shape_tensor, 
+                            nv_mask_tensor, 
+                            layer_info(node) + "_sum(gen_right_pad_shape)_" + std::to_string(i))->getOutput(0);
+
+                auto fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+                fill_layer->setInput(0, *pad_shape_tensor);  // set the output shape
+                fill_layer->setName((layer_info(node) + "_IFillLayer_more_" + std::to_string(padding_index)).c_str());
+                at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+                at::Tensor delta_tensor = torch::zeros(self_rank, torch::kFloat32);  // one dimension only
+                // defaults to float32; convert the type if self is int32
+                if (self->getType() == nvinfer1::DataType::kINT32) {
+                    value_tensor = value_tensor.to(at::ScalarType::Int);
+                    delta_tensor = delta_tensor.to(at::ScalarType::Int);
+                }
+                auto value_itensor = tensor_to_const(engine, value_tensor);
+                fill_layer->setInput(1, *value_itensor);  // start value
+                auto delta_itensor = tensor_to_const(engine, delta_tensor);
+                fill_layer->setInput(2, *delta_itensor);
+
+                itensors_vec.push_back(fill_layer->getOutput(0));
+            }
+
+            // concat
+            nvinfer1::IConcatenationLayer* concat_layer = 
+                engine->network()->addConcatenation(itensors_vec.data(), itensors_vec.size());
+            concat_layer->setAxis(axis);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer_" + std::to_string(padding_index + 1)).c_str());
+            self = concat_layer->getOutput(0);
+            // static case: refresh the cached dims
+            self_dims = self->getDimensions();
+            self_dims_vec = nvdim_to_sizes(self_dims);
+            // dynamic case: refresh the shape tensor
+            if (is_dynamic) {
+                self_shape = engine->network()->addShape(*self)->getOutput(0);
+            }
+        }
+    }
+
+    engine->context().set_tensor(node->outputs()[0], self);
+    LOG(INFO) << "Output tensor shape: " << self->getDimensions();
+
+    return true;
+}
+/**
+ * @brief Convert padding information organized the PyTorch way into padding TensorRT can accept.
+ * PyTorch padding order:
+ *     dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, ..., dim_m_begin, dim_m_end,
+ *     where m is in range [0, n].
+ * Expected target padding order:
+ *     dim_0_begin, dim_1_begin, ..., dim_0_end, ..., dim_n_end,
+ *     where n is the dimension of input.
+ * The current conversion logic assumes the padding itself is constant (whether the padded
+ * tensor is dynamic or not does not matter).
+ * Padding that is itself dynamic is not supported yet; support can be added when such a
+ * scenario actually shows up. The conversion idea for dynamic padding would be:
+ * first pad the padding with zeros, reshape it to (-1, 2), then flip + transpose, and
+ * finally reshape it back to one dimension:
+ * torch::reshape(torch::transpose(aten::flip(torch::reshape(padding_tensor, {-1, 2}), [0]), 1, 0), {-1});
+ * **/
+bool ConstantPadNdConverter::converter_padding(TensorrtEngine* engine,
+                                    int64_t rank,
+                                    const std::vector<int64_t>& padding,
+                                    nvinfer1::ITensor*& start_tensor,
+                                    nvinfer1::ITensor*& total_padding_tensor) {
+
+    std::vector<int64_t> start;
+    std::vector<int64_t> total_padding;
+    if (padding.size() % 2U != 0) {
+        LOG(WARNING) << "padding size should be even but instead it equals: " << padding.size();
+        return false;
+    }
+
+    const int64_t pad_dim_len = static_cast<int64_t>(padding.size() / 2U);
+    const int64_t diff = rank - pad_dim_len;
+    if (diff < 0) {
+        LOG(WARNING) << "padding size should be no more than twice the number of dimensions of the input"
+                    << " , but given padding size is: " << padding.size() 
+                    << " , given input dimensions is: " << rank << ".";
+        return false;
+    }
+
+    start.resize(rank, 0);
+    total_padding.resize(rank, 0);
+
+    for (int64_t i = diff; i < rank; i++) {
+        const int64_t idx = i - diff;
+        const int64_t reverse_idx = pad_dim_len - idx - 1;
+        const int64_t before = padding[reverse_idx * 2];
+        const int64_t after = padding[reverse_idx * 2 + 1];
+        if (before < 0 || after < 0) {
+            return false;
+        }
+        start[i] = -before;
+        total_padding[i] = before + after;
+    }
+
+    at::Tensor at_start_tensor = torch::from_blob(start.data(), start.size(), 
+                                    torch::TensorOptions().dtype(torch::kInt64));
+    start_tensor = tensor_to_const(engine, at_start_tensor);
+
+    at::Tensor at_total_padding_tensor = torch::from_blob(total_padding.data(), total_padding.size(), 
+                                    torch::TensorOptions().dtype(torch::kInt64));
+    total_padding_tensor = tensor_to_const(engine, at_total_padding_tensor);
+    return start_tensor && total_padding_tensor;
+}
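A worked instance of the mapping above, as a standalone sketch (not part of the patch): for rank = 4 and PyTorch padding {1, 2, 3, 4} (last dim padded by (1, 2), second-to-last by (3, 4)), converter_padding should produce start = [0, 0, -3, -1] and total_padding = [0, 0, 7, 3].

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const int64_t rank = 4;
    const std::vector<int64_t> padding = {1, 2, 3, 4};  // pytorch order: last dim first
    std::vector<int64_t> start(rank, 0);
    std::vector<int64_t> total_padding(rank, 0);
    const int64_t pad_dim_len = static_cast<int64_t>(padding.size() / 2);
    const int64_t diff = rank - pad_dim_len;
    for (int64_t i = diff; i < rank; i++) {
        const int64_t reverse_idx = pad_dim_len - (i - diff) - 1;
        start[i] = -padding[reverse_idx * 2];            // negative begin-pad becomes the slice start
        total_padding[i] = padding[reverse_idx * 2] + padding[reverse_idx * 2 + 1];
    }
    for (int64_t i = 0; i < rank; i++) {
        std::cout << start[i] << " " << total_padding[i] << "\n";  // (0,0) (0,0) (-3,7) (-1,3)
    }
    return 0;
}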
+// aten::constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor
+bool ConstantPadNdConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for ConstantPadNdConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), 
+        "input[0] for ConstantPadNdConverter is not Tensor as expected");
+
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    nvinfer1::Dims self_dims = self->getDimensions();
+    const int64_t self_rank = self_dims.nbDims;
+
+    // extract pad
+    torch::jit::IValue maybe_pad = engine->context().get_constant(inputs[1]);
+    POROS_CHECK_TRUE((!maybe_pad.isNone()), "invalid inputs[1] for ConstantPadNdConverter");
+    std::vector<int64_t> padding = maybe_pad.toIntList().vec();
+
+    // extract value
+    torch::jit::IValue maybe_value = engine->context().get_constant(inputs[2]);
+    POROS_CHECK_TRUE((!maybe_value.isNone()), "invalid inputs[2] for ConstantPadNdConverter");
+    nvinfer1::ITensor* value_tensor = nullptr;
+    float value = maybe_value.toScalar().to<float>();
+    // align the type of value with self
+    if (self->getType() == nvinfer1::DataType::kINT32) {
+        value_tensor = tensor_to_const(engine, torch::tensor({value}).to(at::ScalarType::Int));
+    } else if (self->getType() == nvinfer1::DataType::kBOOL && maybe_value.isBool()) {
+        value_tensor = tensor_to_const(engine, torch::tensor({value}).to(at::ScalarType::Bool));
+    } else {
+        value_tensor = tensor_to_const(engine, torch::tensor({value}).to(at::ScalarType::Float));
+    }
+
+    // const bool is_dynamic = check_nvtensor_is_dynamic(self);
+    // // TensorRT's IFillLayer cannot produce bool outputs when dynamic, so this used to
+    // // return false (the bool constant_pad_nd case is rare anyway)
+    // if (is_dynamic && maybe_value.isBool()) {
+    //     LOG(WARNING) << "ConstantPadNdConverter does not support a padding value of type bool when dynamic.";
+    //     return false;
+    // }
+
+    nvinfer1::ITensor* start = nullptr;
+    nvinfer1::ITensor* total_padding = nullptr;
+    if (converter_padding(engine, self_rank, padding, start, total_padding) == false) {
+        return false;
+    }
+    nvinfer1::ITensor* self_shape = engine->network()->addShape(*self)->getOutput(0);
+    nvinfer1::ITensor* size = add_elementwise(engine, 
+                nvinfer1::ElementWiseOperation::kSUM, 
+                self_shape, 
+                total_padding, 
+                layer_info(node) + "_sum(for_padding)")->getOutput(0);
+
+    // stride is fixed to all ones
+    nvinfer1::Dims stride;
+    stride.nbDims = self_rank;
+    std::fill_n(stride.d, self_rank, 1);
+    const nvinfer1::Dims& dummy = stride;
+    nvinfer1::ISliceLayer* layer = engine->network()->addSlice(*self, dummy, dummy, stride);
+    layer->setInput(1, *start);
+    layer->setInput(2, *size);
+    layer->setMode(nvinfer1::SliceMode::kFILL);
+    layer->setInput(4, *value_tensor);
+    layer->setName((layer_info(node) + "_ISliceLayer").c_str());
+
+    engine->context().set_tensor(node->outputs()[0], layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << layer->getOutput(0)->getDimensions();
+
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ConstantPadNdConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
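A quick reference (not part of the patch) for what the kFILL slice above computes, checked against plain libtorch. With self of shape [2, 3], pad = {1, 1} and value = 9, converter_padding yields start = [0, -1] and total_padding = [0, 2]; the slice then reads from index -1 with size [2, 5] and fills the out-of-range elements with 9, which is exactly aten::constant_pad_nd's result:

#include <iostream>
#include "torch/torch.h"

int main() {
    at::Tensor self = torch::arange(6, torch::kFloat32).reshape({2, 3});
    // reference semantics that the ISliceLayer(kFILL) path reproduces
    at::Tensor padded = at::constant_pad_nd(self, {1, 1}, 9);
    std::cout << padded.sizes() << "\n";  // [2, 5]
    std::cout << padded << "\n";          // each row bracketed by 9s
    return 0;
}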
diff --git a/poros/src/poros/converter/gpu/constant_pad_nd.h b/poros/src/poros/converter/gpu/constant_pad_nd.h
new file mode 100644
index 0000000000..b56176ef98
--- /dev/null
+++ b/poros/src/poros/converter/gpu/constant_pad_nd.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant_pad_nd.h
+* @author tianshaoqing@baidu.com
+* @date Thu Dec 2 14:29:20 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ConstantPadNdConverter : public GpuConverter {
+public:
+    ConstantPadNdConverter() {}
+    virtual ~ConstantPadNdConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //DEPRECATED: this implementation is built on concat internally; during TensorRT's profiling
+    //phase it introduces extra copy nodes, which hurts performance.
+    bool converter_old_version(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::constant_pad_nd};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor", {1, 0}}});
+    }
+
+private:
+    /**
+     * @brief Convert padding information organized the PyTorch way into padding TensorRT can accept.
+     * @param [in] engine : omitted here
+     * @param [in] rank : rank of the tensor being padded (i.e. its nbDims value)
+     * @param [in] padding : padding info in PyTorch order, of type int[]
+     *                       (note: PyTorch lists padding from the last dimension backwards)
+     * @param [out] start_tensor : the start tensor used for the slice
+     * @param [out] total_padding_tensor : the per-dimension size increase introduced by the padding
+     * @return bool
+     * @retval true => succeed  false => failed
+     * **/
+    bool converter_padding(TensorrtEngine* engine,
+                        int64_t rank,
+                        const std::vector<int64_t>& padding,
+                        nvinfer1::ITensor*& start_tensor,
+                        nvinfer1::ITensor*& total_padding_tensor);
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/converter_util.cpp b/poros/src/poros/converter/gpu/converter_util.cpp
new file mode 100644
index 0000000000..6c60849da1
--- /dev/null
+++ b/poros/src/poros/converter/gpu/converter_util.cpp
@@ -0,0 +1,376 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Part of the following code in this file refs to
+// https://github.com/pytorch/TensorRT/blob/master/core/conversion/converters/converter_util.cpp
+//
+// Copyright (c) 2020-present, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// Licensed under the 3-Clause BSD License
+
+/**
+* @file converter_util.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Aug 12 14:50:37 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+nvinfer1::ITensor* add_padding(TensorrtEngine* engine, 
+                    const torch::jit::Node* n, 
+                    nvinfer1::ITensor* tensor, 
+                    int nDim, 
+                    bool trailing, 
+                    bool use_zeros) {
+    const auto dims = tensor->getDimensions();
+    if (dims.nbDims < nDim) {
+        auto newDims = dims;
+        for (int dim = dims.nbDims; dim < nDim; ++dim) {
+            newDims = unsqueeze_dims(newDims, trailing ? dim : 0, 1, use_zeros);
+        }
+        LOG(INFO) << "Original shape: " << dims << ", reshaping to: " << newDims;
+        auto shuffle_layer = engine->network()->addShuffle(*tensor);
+        POROS_CHECK(shuffle_layer, "Unable to create shuffle layer");
+        shuffle_layer->setReshapeDimensions(newDims);
+        shuffle_layer->setZeroIsPlaceholder(use_zeros);
+        shuffle_layer->setName((layer_info(n) + " [Reshape to " + nvdim_to_str(newDims) + ']').c_str());
+        return shuffle_layer->getOutput(0);
+    } else {
+        return tensor;
+    }
+}
+
+nvinfer1::ITensor* add_unpadding(TensorrtEngine* engine, 
+                    const torch::jit::Node* n, 
+                    nvinfer1::ITensor* tensor, 
+                    int nDim, 
+                    bool trailing, 
+                    bool use_zeros) {
+    const auto dims = tensor->getDimensions();
+    if (dims.nbDims > nDim) {
+        auto newDims = dims;
+        for (int dim = dims.nbDims; dim > nDim; --dim) {
+            newDims = squeeze_dims(newDims, trailing ? dim - 1 : 0);
+        }
+        LOG(INFO) << "Original shape: " << dims << ", reshaping to: " << newDims;
+        auto shuffle_layer = engine->network()->addShuffle(*tensor);
+        POROS_CHECK(shuffle_layer, "Unable to create shuffle layer");
+        shuffle_layer->setReshapeDimensions(newDims);
+        shuffle_layer->setZeroIsPlaceholder(use_zeros);
+        shuffle_layer->setName((layer_info(n) + " [Reshape to " + nvdim_to_str(newDims) + "]").c_str());
+        return shuffle_layer->getOutput(0);
+    } else {
+        return tensor;
+    }
+}
+
+bool check_tensor_type(nvinfer1::ITensor* &self, nvinfer1::ITensor* &other) {
+    // make sure the two tensors of a binary op share the same type;
+    // promotion order: float32 > half > int32 > int8
+    if (self->getType() != nvinfer1::DataType::kBOOL) {
+        if (self->getType() < other->getType()) {
+            if (self->getType() == nvinfer1::DataType::kINT8) {
+                self->setType(other->getType());
+            } else {
+                other->setType(self->getType());
+            }
+        } else if (other->getType() < self->getType()) {
+            if (other->getType() == nvinfer1::DataType::kINT8) {
+                other->setType(self->getType());
+            } else {
+                self->setType(other->getType());
+            }
+        }
+    }
+    return true;
+}
+
+nvinfer1::ILayer* add_elementwise(TensorrtEngine* engine, 
+                nvinfer1::ElementWiseOperation op, 
+                nvinfer1::ITensor* self, 
+                nvinfer1::ITensor* other, 
+                const std::string& name) {
+    // ensure self to have larger number of dimension
+    bool swapSelfOther = false;
+    check_tensor_type(self, other);
+    if (self->getDimensions().nbDims < other->getDimensions().nbDims) {
+        std::swap(self, other);
+        swapSelfOther = true;
+    }
+    auto selfDim = nvdim_to_sizes(self->getDimensions());
+    auto otherDim = nvdim_to_sizes(other->getDimensions());
+    if (selfDim.size() != otherDim.size()) {
+        // other is with dynamic shape, need to expand its dimension now and get its
+        // shape at runtime.
+        // for other: where its dim is -1 it keeps its own runtime extent; where its dim
+        // is 1 it has to match self's corresponding extent.
+        if (otherDim.end() != std::find(otherDim.begin(), otherDim.end(), -1)) {
+            auto thOtherStaticShapeMask = torch::ones(selfDim.size(), torch::kInt32);
+            auto thOtherDynamicShapeMask = torch::zeros(selfDim.size(), torch::kInt32);
+            for (size_t start = selfDim.size() - otherDim.size(), idx = 0; idx < otherDim.size(); ++idx) {
+                if (-1 != otherDim[idx]) {
+                    thOtherStaticShapeMask[start + idx] = otherDim[idx];
+                } else {
+                    thOtherStaticShapeMask[start + idx] = 0;
+                    if (selfDim[start + idx] == 1) {
+                        thOtherDynamicShapeMask[start + idx] = -1;
+                    } else {
+                        thOtherDynamicShapeMask[start + idx] = 1;
+                    }
+                }
+            }
+            auto otherStaticShapeMask = tensor_to_const(engine, thOtherStaticShapeMask);
+            auto otherDynamicShapeMask = tensor_to_const(engine, thOtherDynamicShapeMask);
+            auto selfShape = engine->network()->addShape(*self)->getOutput(0);
+
+            // the size of a dynamic dimension of other needs to be the same as that of
+            // the corresponding dimension of self
+            auto otherDynamicShape = engine->network()->addElementWise(*selfShape, 
+                        *otherDynamicShapeMask, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0);
+            auto targetOtherShape = engine->network()->addElementWise(*otherDynamicShape, 
+                        *otherStaticShapeMask, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0);
+
+            auto otherShuffle = engine->network()->addShuffle(*other);
+            otherShuffle->setName((name + "_IShuffleLayer").c_str());
+            otherShuffle->setInput(1, *targetOtherShape);
+            other = otherShuffle->getOutput(0);
+        } else {
+            // other is with static shape, expand its dimensions so the two tensors have
+            // the same rank
+            auto otherShuffle = engine->network()->addShuffle(*other);
+            otherShuffle->setReshapeDimensions(sizes_to_nvdim_with_pad(otherDim, selfDim.size()));
+            otherShuffle->setName((name + "_IShuffleLayer").c_str());
+            other = otherShuffle->getOutput(0);
+        }
+    }
+    if (swapSelfOther) {
+        // swap back
+        std::swap(self, other);
+        swapSelfOther = false;
+    }
+    auto ele = engine->network()->addElementWise(*self, *other, op);
+    ele->setName(name.c_str());
+    return ele;
+}
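A worked illustration (not part of the patch) of the two masks built above, using plain libtorch. Suppose selfDim = [2, 3, 4] and otherDim = [-1, 1]: the static mask becomes [1, 0, 1] and the dynamic mask [0, 1, 0], so at runtime target_shape = self_shape * dynamic_mask + static_mask = [1, 3, 1], which broadcasts cleanly against [2, 3, 4].

#include <iostream>
#include <vector>
#include "torch/torch.h"

int main() {
    std::vector<int64_t> selfDim = {2, 3, 4};   // rank 3
    std::vector<int64_t> otherDim = {-1, 1};    // rank 2, dim 0 dynamic
    auto static_mask = torch::ones((int64_t)selfDim.size(), torch::kInt32);
    auto dynamic_mask = torch::zeros((int64_t)selfDim.size(), torch::kInt32);
    for (size_t start = selfDim.size() - otherDim.size(), idx = 0; idx < otherDim.size(); ++idx) {
        if (otherDim[idx] != -1) {
            static_mask[start + idx] = otherDim[idx];
        } else {
            static_mask[start + idx] = 0;
            dynamic_mask[start + idx] = (selfDim[start + idx] == 1) ? -1 : 1;
        }
    }
    // at runtime: target_shape = self_shape * dynamic_mask + static_mask
    auto self_shape = torch::tensor({2, 3, 4}, torch::kInt32);
    std::cout << self_shape * dynamic_mask + static_mask << "\n";  // [1, 3, 1]
    return 0;
}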
+nvinfer1::ITensor* broadcast_itensor(TensorrtEngine* engine, 
+                const torch::jit::Node* n,
+                nvinfer1::ITensor* tensor, 
+                const int new_rank, 
+                std::string name) {
+    int current_rank = tensor->getDimensions().nbDims;
+    POROS_CHECK((current_rank <= new_rank), "Cannot broadcast a higher rank tensor to a lower rank tensor.");
+    if (current_rank < new_rank) {
+        //1. get shape tensor
+        nvinfer1::ITensor* shape_tensor = engine->network()->addShape(*tensor)->getOutput(0);
+
+        //2. padding the missing rank part with value 1.
+        std::vector<int64_t> padding_vec(new_rank - current_rank, 1);
+        nvinfer1::Dims padding_dim = sizes_to_nvdim(c10::IntArrayRef(padding_vec));
+        at::Tensor the_padding = torch::tensor(nvdim_to_sizes(padding_dim), torch::kInt32);
+        nvinfer1::ITensor* padding_shape = tensor_to_const(engine, the_padding);
+
+        //3. concat the shape tensor
+        std::vector<nvinfer1::ITensor*> to_concat_tensors = {padding_shape, shape_tensor};
+        nvinfer1::IConcatenationLayer* shape_cat_layer = engine->network()->addConcatenation(to_concat_tensors.data(), to_concat_tensors.size());
+        shape_cat_layer->setName((layer_info(n) + "_IConcatenationLayer_for_" + name).c_str());
+        auto new_shape = shape_cat_layer->getOutput(0);
+
+        //4. shuffle the given tensor to the new shape
+        nvinfer1::IShuffleLayer* reshape_layer = engine->network()->addShuffle(*tensor);
+        reshape_layer->setInput(1, *new_shape);
+        reshape_layer->setName((layer_info(n) + "_IShuffleLayer_for_" + name).c_str());
+        nvinfer1::ITensor* new_tensor = reshape_layer->getOutput(0);
+        return new_tensor;
+    }
+    return tensor;
+}
+
+nvinfer1::ITensor* cast_itensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* tensor, 
+                nvinfer1::DataType dtype) {
+    if (tensor->getType() != dtype) {
+        std::ostringstream tensor_id;
+        tensor_id << reinterpret_cast<uintptr_t>(tensor);
+
+        auto id_layer = engine->network()->addIdentity(*tensor);
+        POROS_CHECK(id_layer, "Unable to create identity layer for ITensor: " << tensor_id.str());
+        auto casted_tensor = id_layer->getOutput(0);
+        casted_tensor->setType(dtype);
+
+        LOG(INFO) << "Casting ITensor " << tensor_id.str() << " from " << tensor->getType() << " to " << dtype;
+        std::stringstream ss;
+        ss << "[Cast ITensor " << tensor_id.str() << " from " << tensor->getType() << " to " << dtype << "]";
+        id_layer->setName(ss.str().c_str());
+        return casted_tensor;
+    } else {
+        return tensor;
+    }
+}
+
+// unsqueeze an nv shape tensor; negative dims are supported
+nvinfer1::ITensor* unsqueeze_nv_shapetensor(TensorrtEngine* engine, 
+                        nvinfer1::ITensor* input, int dim) {
+    nvinfer1::Dims input_dims = input->getDimensions();
+
+    if (input_dims.nbDims != 1 || input->getType() != nvinfer1::DataType::kINT32) {
+        LOG(INFO) << "input is not a shape tensor";
+        return nullptr;
+    }
+    // dim must be in range of [-input_dims.d[0] - 1, input_dims.d[0]].
+    if (dim < -input_dims.d[0] - 1 || dim > input_dims.d[0]) {
+        LOG(INFO) << "expected to be in range of [" << -input_dims.d[0] - 1 << "," 
+                << input_dims.d[0] << "], but got " << dim;
+        return nullptr;
+    }
+    if (dim < 0) {
+        dim = input_dims.d[0] + dim + 1;
+    }
+    std::vector<nvinfer1::ITensor*> inputs_nvtensor;
+    nvinfer1::ITensor* insert_tensor = tensor_to_const(engine, torch::tensor({1}, torch::kInt));
+    // if dim == 0 or dim == input_dims.d[0], concat the original tensor and the insert tensor directly.
+    if (dim == 0) {
+        inputs_nvtensor.push_back(insert_tensor);
+        inputs_nvtensor.push_back(input);
+    } else if (dim == input_dims.d[0]) {
+        inputs_nvtensor.push_back(input);
+        inputs_nvtensor.push_back(insert_tensor);
+    } else {
+        // divide the original tensor into two parts, then insert the unsqueezed dim.
+        std::vector<int64_t> start_vec{0}, size_vec{dim}, stride_vec{1};
+        nvinfer1::ISliceLayer* slice_front = engine->network()->addSlice(*input,
+                                                sizes_to_nvdim(start_vec),
+                                                sizes_to_nvdim(size_vec),
+                                                sizes_to_nvdim(stride_vec));
+        inputs_nvtensor.push_back(slice_front->getOutput(0));
+        inputs_nvtensor.push_back(insert_tensor);
+        start_vec[0] = dim;
+        size_vec[0] = input_dims.d[0] - dim;
+        nvinfer1::ISliceLayer* slice_back = engine->network()->addSlice(*input,
+                                                sizes_to_nvdim(start_vec),
+                                                sizes_to_nvdim(size_vec),
+                                                sizes_to_nvdim(stride_vec));
+        inputs_nvtensor.push_back(slice_back->getOutput(0));
+    }
+    nvinfer1::IConcatenationLayer* concat_layer = 
+            engine->network()->addConcatenation(inputs_nvtensor.data(), inputs_nvtensor.size());
+    concat_layer->setAxis(0);
+    return concat_layer->getOutput(0);
+}
+
+// squeeze an nv shape tensor; negative dims are supported
+// note: callers must ensure input[dim] == 1 beforehand
+nvinfer1::ITensor* squeeze_nv_shapetensor(TensorrtEngine* engine, 
+                        nvinfer1::ITensor* input, int dim) {
+    nvinfer1::Dims input_dims = input->getDimensions();
+
+    if (input_dims.nbDims != 1 || input->getType() != nvinfer1::DataType::kINT32) {
+        LOG(INFO) << "input is not a shape tensor";
+        return nullptr;
+    }
+    // dim must be in range of [-input_dims.d[0], input_dims.d[0] - 1].
+    if (dim < -input_dims.d[0] || dim > input_dims.d[0] - 1) {
+        LOG(INFO) << "expected to be in range of [" << -input_dims.d[0] << ","
+                << input_dims.d[0] - 1 << "], but got " << dim;
+        return nullptr;
+    }
+    if (dim < 0) {
+        dim = input_dims.d[0] + dim;
+    }
+    std::vector<nvinfer1::ITensor*> inputs_nvtensor;
+    // if dim == 0 or dim == input_dims.d[0] - 1, slice the squeezed dimension away directly.
+    std::vector<int64_t> start_vec{0}, size_vec{input_dims.d[0] - 1}, stride_vec{1};
+    if (dim == 0 || dim == input_dims.d[0] - 1) {
+        if (dim == 0) {
+            start_vec[0] = 1;
+        }
+        nvinfer1::ISliceLayer* slice_l = engine->network()->addSlice(*input,
+                                            sizes_to_nvdim(start_vec),
+                                            sizes_to_nvdim(size_vec),
+                                            sizes_to_nvdim(stride_vec));
+        return slice_l->getOutput(0);
+    } else {
+        // divide the original tensor into two parts (skipping the squeezed dim) and concat them.
+        std::vector<int64_t> start_vec{0}, size_vec{dim}, stride_vec{1};
+        nvinfer1::ISliceLayer* slice_front = engine->network()->addSlice(*input,
+                                                sizes_to_nvdim(start_vec),
+                                                sizes_to_nvdim(size_vec),
+                                                sizes_to_nvdim(stride_vec));
+        inputs_nvtensor.push_back(slice_front->getOutput(0));
+        start_vec[0] = dim + 1;
+        size_vec[0] = input_dims.d[0] - dim - 1;
+        nvinfer1::ISliceLayer* slice_back = engine->network()->addSlice(*input,
+                                                sizes_to_nvdim(start_vec),
+                                                sizes_to_nvdim(size_vec),
+                                                sizes_to_nvdim(stride_vec));
+        inputs_nvtensor.push_back(slice_back->getOutput(0));
+    }
+    nvinfer1::IConcatenationLayer* concat_layer = 
+            engine->network()->addConcatenation(inputs_nvtensor.data(), inputs_nvtensor.size());
+    concat_layer->setAxis(0);
+    return concat_layer->getOutput(0);
+}
+
+nvinfer1::ITensor* unsqueeze_itensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* input, 
+                const std::vector<int64_t>& axes) {
+    nvinfer1::ITensor* input_shape_tensor = engine->network()->addShape(*input)->getOutput(0);
+    int input_rank = input->getDimensions().nbDims;
+
+    const std::set<int64_t> axes_set(axes.begin(), axes.end());
+    if (input_rank + axes_set.size() > nvinfer1::Dims::MAX_DIMS) {
+        return nullptr;
+    }
+
+    // compute interlacing subscripts.
+    std::vector<int64_t> subscripts(input_rank);
+    std::iota(subscripts.begin(), subscripts.end(), 0);
+    for (const auto& axis : axes_set) {
+        subscripts.insert(subscripts.begin() + axis, input_rank);
+    }
+    at::Tensor indices = torch::tensor(subscripts, torch::kInt32);
+    auto indices_tensor = tensor_to_const(engine, indices);
+
+    //calculate gather(concat(input_shape_tensor, {1}), indices_tensor)
+    torch::Tensor the_one = torch::tensor(std::vector<int32_t>({1}), torch::kInt32);
+    nvinfer1::ITensor* one_tensor = tensor_to_const(engine, the_one);
+    nvinfer1::ITensor* const args[2] = {input_shape_tensor, one_tensor};
+    nvinfer1::ITensor* tmp_concat_tensor = engine->network()->addConcatenation(args, 2)->getOutput(0);
+    nvinfer1::ITensor* new_shape_tensor = engine->network()->addGather(*tmp_concat_tensor, *indices_tensor, 0)->getOutput(0);
+
+    nvinfer1::IShuffleLayer* reshape_layer = engine->network()->addShuffle(*input);
+    reshape_layer->setInput(1, *new_shape_tensor);
+    return reshape_layer->getOutput(0);
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
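A standalone sketch (not part of the patch) of the interlacing-subscripts trick used by unsqueeze_itensor. For input_rank = 3 and axes = {0, 3}, the gather indices come out as [3, 0, 1, 3, 2]; gathering from concat(shape, [1]) (where index 3 points at the appended 1) turns shape [d0, d1, d2] into [1, d0, d1, 1, d2].

#include <cstdint>
#include <iostream>
#include <numeric>
#include <set>
#include <vector>

int main() {
    const int64_t input_rank = 3;
    const std::set<int64_t> axes_set = {0, 3};
    std::vector<int64_t> subscripts(input_rank);
    std::iota(subscripts.begin(), subscripts.end(), 0);  // [0, 1, 2]
    for (const auto& axis : axes_set) {
        // input_rank indexes the extra "1" appended after the real shape values
        subscripts.insert(subscripts.begin() + axis, input_rank);
    }
    for (int64_t s : subscripts) {
        std::cout << s << " ";                           // 3 0 1 3 2
    }
    std::cout << "\n";
    return 0;
}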
diff --git a/poros/src/poros/converter/gpu/converter_util.h b/poros/src/poros/converter/gpu/converter_util.h
new file mode 100644
index 0000000000..66492eda3b
--- /dev/null
+++ b/poros/src/poros/converter/gpu/converter_util.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file converter_util.h
+* @author tianjinjin@baidu.com
+* @date Thu Aug 12 10:50:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+#include "torch/script.h"
+#include "NvInfer.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+nvinfer1::ITensor* add_padding(TensorrtEngine* engine, 
+                    const torch::jit::Node* n, 
+                    nvinfer1::ITensor* tensor, 
+                    int nDim, 
+                    bool trailing = true, 
+                    bool use_zeros = true);
+
+nvinfer1::ITensor* add_unpadding(TensorrtEngine* engine, 
+                    const torch::jit::Node* n, 
+                    nvinfer1::ITensor* tensor, 
+                    int nDim, 
+                    bool trailing = true, 
+                    bool use_zeros = true);
+
+nvinfer1::ILayer* add_elementwise(TensorrtEngine* engine, 
+                nvinfer1::ElementWiseOperation op, 
+                nvinfer1::ITensor* self, 
+                nvinfer1::ITensor* other, 
+                const std::string& name);
+
+nvinfer1::ITensor* broadcast_itensor(TensorrtEngine* engine, 
+                const torch::jit::Node* n,
+                nvinfer1::ITensor* tensor, 
+                const int new_rank, 
+                std::string name);
+
+//If an ITensor is of a type not dtype, add an Identity layer to cast it to dtype
+nvinfer1::ITensor* cast_itensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* tensor, 
+                nvinfer1::DataType dtype);
+
+// unsqueeze an nv shape tensor; negative dims are supported
+nvinfer1::ITensor* unsqueeze_nv_shapetensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* input, 
+                int dim);
+// squeeze an nv shape tensor; negative dims are supported
+// note: callers must ensure input[dim] == 1 beforehand
+nvinfer1::ITensor* squeeze_nv_shapetensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* input, int dim);
+
+// unsqueeze a regular nv tensor at the given axes
+nvinfer1::ITensor* unsqueeze_itensor(TensorrtEngine* engine, 
+                nvinfer1::ITensor* input, 
+                const std::vector<int64_t>& axes);
+
+//TODO: add a squeeze counterpart for regular nv tensors
+// nvinfer1::ITensor* squeeze_itensor(TensorrtEngine* engine, 
+//                 nvinfer1::ITensor* input, 
+//                 const std::vector<int64_t>& axes);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
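A small sketch (not part of the patch) of how the add_padding/add_unpadding pair brackets a layer that requires 4-D input, as the convolution converter below does for conv1d. The dims values here are illustrative only.

#include <iostream>
#include <vector>

int main() {
    // conv1d input [N, C, L] has rank 3; the converter reshapes it to rank 4 by
    // appending trailing 1s (add_padding), runs the 2-D convolution, then squeezes
    // the extra dim away again (add_unpadding).
    std::vector<long> dims = {8, 16, 100};  // [N, C, L]
    while (dims.size() < 4) {
        dims.push_back(1);                  // trailing unsqueeze -> [8, 16, 100, 1]
    }
    for (long d : dims) {
        std::cout << d << " ";
    }
    std::cout << "\n";
    return 0;
}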
diff --git a/poros/src/poros/converter/gpu/convolution.cpp b/poros/src/poros/converter/gpu/convolution.cpp
new file mode 100644
index 0000000000..179a569e1e
--- /dev/null
+++ b/poros/src/poros/converter/gpu/convolution.cpp
@@ -0,0 +1,271 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file convolution.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/convolution.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//note: compared with _convolution, conv?d takes 7 inputs while _convolution takes 12 or 13.
+//note2: compared with _convolution, conv?d lacks the transposed parameter (bool type); defaulting it to false is fine.
+//note3: compared with _convolution, conv?d lacks the output_padding parameter (int[] type); defaulting it to zeros is fine as well.
+//note4: the conv1d/conv2d/conv3d variants differ from each other only in the length of their int[] parameters.
+bool ConvolutionConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 12 || inputs.size() == 13 || inputs.size() == 7), 
+        "invalid inputs size for ConvolutionConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), 
+        "input[0] for ConvolutionConverter is not Tensor as expected");
+    //ATTENTION HERE: assumes weight inputs all come from prim::Constant and are TensorType.
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get()) &&
+        inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for ConvolutionConverter is not Tensor or does not come from prim::Constant as expected");
+    //ATTENTION HERE: assumes int[] inputs all come from prim::Constant.
+    POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant),
+        "input[3] for ConvolutionConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[4]->node()->kind() == torch::jit::prim::Constant),
+        "input[4] for ConvolutionConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[5]->node()->kind() == torch::jit::prim::Constant),
+        "input[5] for ConvolutionConverter does not come from prim::Constant as expected");
+    if (inputs.size() == 12 || inputs.size() == 13) {
+        POROS_CHECK_TRUE((inputs[7]->node()->kind() == torch::jit::prim::Constant),
+            "input[7] for ConvolutionConverter does not come from prim::Constant as expected");
+    }
+    //extract in
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    //extract dims settings
+    auto stride = sizes_to_nvdim((engine->context().get_constant(inputs[3])).toIntList());
+    auto padding = sizes_to_nvdim((engine->context().get_constant(inputs[4])).toIntList());
+    auto dilation = sizes_to_nvdim((engine->context().get_constant(inputs[5])).toIntList());
+
+    //handle the difference between _convolution and conv?d.
+    bool transposed = false;
+    nvinfer1::Dims out_padding;
+    int64_t groups = 1;
+    if (inputs.size() == 12 || inputs.size() == 13) {
+        transposed = (engine->context().get_constant(inputs[6])).toBool();
+        out_padding = sizes_to_nvdim((engine->context().get_constant(inputs[7])).toIntList());
+        groups = (engine->context().get_constant(inputs[8])).toInt();
+    //situation when conv1d & conv2d & conv3d, which have no transposed and output_padding parameters.
+    } else {
+        out_padding.nbDims = padding.nbDims;
+        for (int i = 0; i < padding.nbDims; i++) {
+            out_padding.d[i] = 0;
+        }
+        groups = (engine->context().get_constant(inputs[6])).toInt();
+    }
+
+    //handle stride & dilation & padding & out_padding
+    if (stride.nbDims == 1) {
+        stride = unsqueeze_dims(stride, 1, stride.d[0]);
+        LOG(INFO) << "Reshaped stride for ConvolutionConverter: " << stride;
+    }
+    if (dilation.nbDims == 1) {
+        dilation = unsqueeze_dims(dilation, 1, dilation.d[0]);
+        LOG(INFO) << "Reshaped dilation for ConvolutionConverter: " << dilation;
+    }
+    if (padding.nbDims == 1) {
+        padding = unsqueeze_dims(padding, 1, 0);
+        LOG(INFO) << "Reshaped padding for ConvolutionConverter: " << padding;
+    }
+    if (out_padding.nbDims == 1) {
+        out_padding = unsqueeze_dims(out_padding, 1, 0);
+        LOG(INFO) << "Reshaped out_padding for ConvolutionConverter: " << out_padding;
+    }
+
+    // According to our tests, when the GPU architecture is Ampere and nvidia tf32 is enabled,
+    // having IConvolutionLayer set PostPadding explicitly, even to its default value, can make
+    // tensorrt choose the slow Conv+BN+Relu kernel in some cases.
+    // So we try not to set PostPadding when it is the default value.
+    bool out_padding_need_set = false;
+    for (int32_t i = 0; i < out_padding.nbDims; i++) {
+        if (out_padding.d[i] != 0) {
+            out_padding_need_set = true;
+            break;
+        }
+    }
+
+    //extract bias
+    auto maybe_bias = engine->context().get_constant(inputs[2]);
+    Weights bias;
+    if (maybe_bias.isTensor()) {
+        bias = Weights(maybe_bias.toTensor());
+    } else { //when bias is None
+        bias = Weights();
+    }
+
+    //extract weight
+    //the situation can be complex:
+    //sometimes this parameter comes from a constant, sometimes from another tensor,
+    //because of the handling strategy we set in prim::Constant.
+    //we'd better check whether it comes from a constant first.
+    auto maybe_weight = engine->context().get_constant(inputs[1]);
+    /*---------------------------------------------------------------------------
+     * when weight comes from a constant
+    ---------------------------------------------------------------------------*/
+    if (maybe_weight.isTensor()) {
+        auto weight = Weights(maybe_weight.toTensor());
+
+        //first: handle input
+        auto dims = in->getDimensions();
+        auto orig_dims = dims;
+        POROS_CHECK(orig_dims.nbDims > 2, "Unable to create convolution layer from node: " << *node);
+
+        bool expandDims = (orig_dims.nbDims < 4);
+        if (expandDims) {
+            in = add_padding(engine, node, in, 4);
+            dims = in->getDimensions();
+            LOG(INFO) << "Reshaped Input dims: " << dims;
+        }
+
+        //second: handle dims
+        if (weight.shape.nbDims < 4) {
+            for (int i = weight.shape.nbDims; i < 4; ++i) {
+                weight.shape.d[i] = 1;
+            }
+            weight.shape.nbDims = 4;
+            weight.kernel_shape.nbDims = 2;
+            weight.kernel_shape.d[1] = 1;
+            LOG(INFO) << "Reshaped Weights for ConvolutionConverter: " << weight;
+        }
+
+        //finally: try to add the new layer
+        nvinfer1::ILayer* new_layer;
+        if (transposed) {
+            // shape of deconvolution's weight: [in, out/groups, ...]
+            auto deconv = engine->network()->addDeconvolutionNd(*in, 
+                        weight.shape.d[1] * groups, weight.kernel_shape, weight.data, bias.data);
+            POROS_CHECK(deconv, "Unable to create deconvolution layer from node: " << *node);
+
+            deconv->setStrideNd(stride);
+            deconv->setPaddingNd(padding);
+            deconv->setName((layer_info(node) + "_IDeconvolutionLayer").c_str());
+#if NV_TENSORRT_MAJOR > 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR >= 1)
+            deconv->setDilationNd(dilation);
+            deconv->setNbGroups(groups);
+#else
+            POROS_CHECK(groups == 1, "for deconv with groups > 1, require TensorRT version >= 7.1");
+            for (int idx = 0; idx < dilation.nbDims; idx++) {
+                POROS_CHECK(dilation.d[idx] == 1, "for deconv with dilation > 1, require TensorRT version >= 7.1");
+            }
+#endif
+            new_layer = deconv;
+        // when transposed == false
+        } else {
+            // shape of convolution's weight: [out, in/groups, ...]
+            auto conv = engine->network()->addConvolutionNd(*in, 
+                        weight.shape.d[0], weight.kernel_shape, weight.data, bias.data);
+            POROS_CHECK(conv, "Unable to create convolution layer from node: " << *node);
+
+            conv->setStrideNd(stride);
+            conv->setPaddingMode(nvinfer1::PaddingMode::kCAFFE_ROUND_DOWN);
+            conv->setPaddingNd(padding);
+            if (out_padding_need_set) {
+                conv->setPostPadding(out_padding);
+            }
+            conv->setDilationNd(dilation);
+            conv->setNbGroups(groups);
+            conv->setName((layer_info(node) + "_IConvolutionLayer").c_str());
+            new_layer = conv;
+        }
+
+        auto out = add_unpadding(engine, node, new_layer->getOutput(0), orig_dims.nbDims);
+        engine->context().set_tensor(node->outputs()[0], out);
+        LOG(INFO) << "Output tensor shape: " << out->getDimensions();
+
+        return true;
+    /*---------------------------------------------------------------------------
+     * when weight comes from another layer's output (especially a Dequantize layer)
+    ---------------------------------------------------------------------------*/
+    } else {
+        auto kernel = engine->context().get_tensor(inputs[1]);
+        POROS_CHECK_TRUE((kernel != nullptr), "Unable to init input tensor for node: " << *node);
+        auto kernel_dims = kernel->getDimensions();
+
+        // Make a new Dims with only the spatial dimensions.
+        nvinfer1::Dims filter_dim;
+        int64_t nbSpatialDims = in->getDimensions().nbDims - 2;
+        POROS_CHECK(nbSpatialDims == (kernel_dims.nbDims - 2),
+            "Number of input spatial dimensions should match the kernel spatial dimensions");
+        filter_dim.nbDims = nbSpatialDims;
+        filter_dim.d[0] = kernel_dims.d[2];
+        filter_dim.d[1] = kernel_dims.d[3];
+
+        // Initialize a dummy constant kernel to pass it to INetwork->addConvolutionNd/addDeconvolutionNd API.
+        auto kernel_weights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+        nvinfer1::ILayer* layer = nullptr;
+        if (transposed) {
+            nvinfer1::IDeconvolutionLayer* deconv = engine->network()->addDeconvolutionNd(*in,
+                                                        kernel_dims.d[0],
+                                                        filter_dim,
+                                                        kernel_weights,
+                                                        bias.data);
+            deconv->setStrideNd(stride);
+            deconv->setDilationNd(dilation);
+            deconv->setNbGroups(groups);
+            deconv->setPaddingNd(padding);
+            // Set deconv kernel weights
+            deconv->setInput(1, *kernel);
+            deconv->setName((layer_info(node) + "_IDeconvolutionLayer").c_str());
+            POROS_CHECK(deconv, "Unable to create deconv layer with non-const weights from node: " << *node);
+            layer = deconv;
+        } else {
+            nvinfer1::IConvolutionLayer* conv = engine->network()->addConvolutionNd(*in,
+                                                    kernel_dims.d[0],
+                                                    filter_dim,
+                                                    kernel_weights,
+                                                    bias.data);
+            conv->setStrideNd(stride);
+            conv->setPaddingMode(nvinfer1::PaddingMode::kCAFFE_ROUND_DOWN);
+            conv->setPaddingNd(padding);
+            if (out_padding_need_set) {
+                conv->setPostPadding(out_padding);
+            }
+            conv->setDilationNd(dilation);
+            conv->setNbGroups(groups);
+            // Set conv kernel weights
+            conv->setInput(1, *kernel);
+            conv->setName((layer_info(node) + "_IConvolutionLayer").c_str());
+            layer = conv;
+        }
+        engine->context().set_tensor(node->outputs()[0], layer->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << layer->getOutput(0)->getDimensions();
+        return true;
+    }
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ConvolutionConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
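A brief illustration (not part of the patch) of the weight layouts the two branches above rely on: convolution weights are [out, in/groups, kH, kW], while deconvolution weights are [in, out/groups, kH, kW], which is why the deconv branch recovers the output-channel count as weight.shape.d[1] * groups. Checked here with libtorch shapes only.

#include <iostream>
#include "torch/torch.h"

int main() {
    const int64_t in_ch = 8, out_ch = 4, groups = 2;
    auto conv_w = torch::empty({out_ch, in_ch / groups, 3, 3});    // aten::conv2d weight
    auto deconv_w = torch::empty({in_ch, out_ch / groups, 3, 3});  // transposed conv weight
    std::cout << conv_w.sizes() << " " << deconv_w.sizes() << "\n";
    // deconv output channels recovered as weight.size(1) * groups:
    std::cout << deconv_w.size(1) * groups << "\n";  // 4
    return 0;
}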
diff --git a/poros/src/poros/converter/gpu/convolution.h b/poros/src/poros/converter/gpu/convolution.h
new file mode 100644
index 0000000000..422d1ac0d1
--- /dev/null
+++ b/poros/src/poros/converter/gpu/convolution.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file convolution.h
+* @author tianjinjin@baidu.com
+* @date Wed Aug 11 16:00:26 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ConvolutionConverter : public GpuConverter {
+public:
+    ConvolutionConverter() {}
+    virtual ~ConvolutionConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+    // bool converter(TensorrtEngine* engine,
+    //             const std::vector<const torch::jit::Value*> inputs,
+    //             const std::vector<const torch::jit::Value*> outputs);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor",
+                "aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor",
+                "aten::conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor",
+                "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor",
+                "aten::conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor"
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::_convolution,
+                torch::jit::aten::conv1d,
+                torch::jit::aten::conv2d,
+                torch::jit::aten::conv3d};
+    }
+};
+
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/einsum.cpp b/poros/src/poros/converter/gpu/einsum.cpp
new file mode 100644
index 0000000000..9945aa54fc
--- /dev/null
+++ b/poros/src/poros/converter/gpu/einsum.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file einsum.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Jul 06 11:24:51 CST 2022
+* @brief
+**/
+
+#include "poros/converter/gpu/einsum.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// aten::einsum(str equation, Tensor[] tensors) -> (Tensor)
+bool EinsumConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for EinsumConverter");
+    POROS_CHECK_TRUE(inputs[1]->type()->isSubtypeOf(c10::ListType::ofTensors()), 
+        "input[1] for EinsumConverter is not TensorList as expected.");
+
+    // extract equation string
+    torch::jit::IValue equation_ivalue = engine->context().get_constant(inputs[0]);
+    POROS_CHECK_TRUE(equation_ivalue.isString(), "EinsumConverter input[0] is not constant string as expected.");
+    std::string equation_str = equation_ivalue.toStringRef();
+
+    // convert upper-case letters to lower case: nvinfer1::IEinsumLayer does not support
+    // upper-case letters in the equation
+    for (auto it = equation_str.begin(); it != equation_str.end(); it++) {
+        if ((*it) >= 'A' && (*it) <= 'Z') {
+            *it = *it + 32;
+        }
+    }
+
+    // extract tensorlist
+    // note: in unit tests TensorRT reports "nbInputs > 0 && nbInputs <= MAX_EINSUM_NB_INPUTS"
+    // when more than two input tensors are passed in. It is unclear whether MAX_EINSUM_NB_INPUTS
+    // is fixed at 2 or depends on the equation, so no check is added for now.
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    POROS_CHECK_TRUE(engine->context().get_tensorlist(inputs[1], tensorlist), "EinsumConverter "
+        "extract tensor list error.");
+
+    nvinfer1::IEinsumLayer* einsum_layer = engine->network()->addEinsum(tensorlist.data(),
+                                                        tensorlist.size(),
+                                                        equation_str.c_str());
+    einsum_layer->setName((layer_info(node) + "_IEinsumLayer").c_str());
+
+    nvinfer1::ITensor* output = einsum_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output shape: " << output->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, EinsumConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
+ +/** +* @file element_wise.cpp +* @author tianjinjin@baidu.com +* @date Fri Aug 27 15:32:36 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/element_wise.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +nvinfer1::ITensor* GreaterOrLessConverter::scalar_to_nvtensor(TensorrtEngine* engine, at::Scalar s) { + nvinfer1::ITensor* out; + if (s.isIntegral(false)) { + auto s_int = s.to(); + auto s_t = torch::tensor({s_int}).to(at::kInt); + out = tensor_to_const(engine, s_t); + } else if (s.isBoolean()) { + auto s_bool = s.to(); + auto s_t = torch::tensor({s_bool}).to(at::kBool); + out = tensor_to_const(engine, s_t); + } else if (s.isFloatingPoint()) { + auto other_float = s.to(); + auto s_t = torch::tensor({other_float}); + out = tensor_to_const(engine, s_t); + } else { + out = nullptr; + POROS_THROW_ERROR("Unsupported data type for scalar. Found: (" << s.type() << ")"); + } + return out; +} + +/* +"aten::gt.Tensor(Tensor self, Tensor other) -> Tensor", +"aten::gt.Scalar(Tensor self, Scalar other) -> Tensor",*/ +bool GreaterOrLessConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 2), "invaid inputs size for GreaterOrLessConverter"); + + int schema_index = 0; + for (std::string schema : this->schema_string()) { + if (node->schema().operator_name() == torch::jit::parseSchema(schema).operator_name()) { + break; + } + schema_index++; + } + + if (schema_index >= 8 && schema_index <= 11) { + int a = engine->context().get_constant(inputs[0]).toScalar().to(); + int b = engine->context().get_constant(inputs[1]).toScalar().to(); + bool output = true; + if (schema_index == 8) { + output = (a > b); + } + if (schema_index == 9) { + output = (a < b); + } + if (schema_index == 10) { + output = (a >= b); + } + if (schema_index == 11) { + output = (a <= b); + } + engine->context().set_constant(node->outputs()[0], output); + return true; + } + + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for GreaterOrLessConverter is not Tensor as expected"); + + //extract self + auto self = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node); + auto other = engine->context().get_tensor(inputs[1]); + //when other input is Scalar + if (other == nullptr) { + auto other_const = engine->context().get_constant(inputs[1]); + if (other_const.isScalar()) { + other = scalar_to_nvtensor(engine, other_const.toScalar()); + if (self->getType() != other->getType()) { + other = cast_itensor(engine, other, self->getType()); + } + } else { + POROS_THROW_ERROR("Unable to get input other value for GreaterOrLessConverter"); + } + } + + nvinfer1::ElementWiseOperation ew_option; + std::string name_suffix; + if (node->kind() == torch::jit::aten::gt || node->kind() == torch::jit::aten::ge) { + ew_option = nvinfer1::ElementWiseOperation::kGREATER; + name_suffix = "_greater"; + } else if (node->kind() == torch::jit::aten::lt || node->kind() == torch::jit::aten::le) { + ew_option = nvinfer1::ElementWiseOperation::kLESS; + name_suffix = "_less"; + } else { + POROS_THROW_ERROR("Meet some unknown node kind in GreaterOrLessConverter"); + 
+    auto new_layer = add_elementwise(engine,
+            ew_option,
+            self,
+            other,
+            layer_info(node) + name_suffix);
+    POROS_CHECK(new_layer, "Unable to create element wise layer from node: " << *node);
+
+    //situation: aten::gt or aten::lt
+    if (node->kind() == torch::jit::aten::gt || node->kind() == torch::jit::aten::lt) {
+        engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    // situation: aten::ge or aten::le,
+    // we should set three layers: kGREATER(or kLESS) and kEQUAL and kOR.
+    if (node->kind() == torch::jit::aten::ge || node->kind() == torch::jit::aten::le) {
+        //equal layer
+        auto equal = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kEQUAL,
+                self,
+                other,
+                layer_info(node) + "_equal");
+        POROS_CHECK(equal, "Unable to create Equal layer from node: " << *node);
+
+        //or layer
+        auto or_op = engine->network()->addElementWise(
+                *new_layer->getOutput(0),
+                *equal->getOutput(0),
+                nvinfer1::ElementWiseOperation::kOR);
+        POROS_CHECK(or_op, "Unable to create Or layer from node: " << *node);
+
+        or_op->setName((layer_info(node) + "_or").c_str());
+        engine->context().set_tensor(node->outputs()[0], or_op->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << or_op->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    POROS_THROW_ERROR("Meet some unknown node kind in GreaterOrLessConverter");
+    return false;
+}
+
+/*
+"aten::eq.Tensor(Tensor self, Tensor other) -> Tensor",
+"aten::eq.Scalar(Tensor self, Scalar other) -> Tensor",*/
+bool EqualOrNotequalConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for EqualOrNotequalConverter");
+
+    int schema_index = 0;
+    for (std::string schema : this->schema_string()) {
+        if (node->schema().operator_name() == torch::jit::parseSchema(schema).operator_name()) {
+            break;
+        }
+        schema_index++;
+    }
+
+    if (schema_index == 4 || schema_index == 5) {
+        int a = engine->context().get_constant(inputs[0]).toScalar().to<int>();
+        int b = engine->context().get_constant(inputs[1]).toScalar().to<int>();
+        bool output = true;
+        if (schema_index == 4) {
+            output = (a == b);
+        }
+        if (schema_index == 5) {
+            output = (a != b);
+        }
+        engine->context().set_constant(node->outputs()[0], output);
+        return true;
+    }
+
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for EqualOrNotequalConverter is not Tensor as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto other = engine->context().get_tensor(inputs[1]);
+    //when other input is Scalar
+    if (other == nullptr) {
+        auto other_const = engine->context().get_constant(inputs[1]);
+        if (other_const.isScalar()) {
+            auto other_scalar = other_const.toScalar().to<float>();
+            other = tensor_to_const(engine, torch::tensor({other_scalar}));
+            if (node->kind() == torch::jit::aten::eq) {
+                //TODO: in the aten::ne case, we may also need the cast logic below??
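+                // Illustrative note (assumption, not in the original code): kEQUAL on
+                // kBOOL inputs is not reliably supported, hence the int32 detour below,
+                // e.g. self = [true, false] -> int32 [1, 0], scalar 1 -> [1],
+                // kEQUAL -> [1, 0], which matches aten::eq(self, 1).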
+                if (self->getType() == nvinfer1::DataType::kBOOL) {
+                    if (other_scalar == 0 || other_scalar == 1) {
+                        LOG(INFO) << "Since input tensor is type bool, casting input tensor and scalar to int32";
+                        other = cast_itensor(engine, other, nvinfer1::DataType::kINT32);
+                        self = cast_itensor(engine, self, nvinfer1::DataType::kINT32);
+                    } else {
+                        LOG(WARNING) << "Input Tensor has type bool, but scalar is not 0 or 1. Found: " << other_scalar;
+                        return false;
+                    }
+                }
+                if (self->getType() != other->getType()) {
+                    other = cast_itensor(engine, other, self->getType());
+                }
+            }
+        } else {
+            POROS_THROW_ERROR("Unable to get input other value for EqualOrNotequalConverter");
+        }
+    }
+
+    auto equal_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kEQUAL,
+            self,
+            other,
+            layer_info(node) + "_equal");
+    POROS_CHECK(equal_layer, "Unable to create equal layer from node: " << *node);
+
+    //situation: aten::eq
+    if (node->kind() == torch::jit::aten::eq) {
+        engine->context().set_tensor(node->outputs()[0], equal_layer->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << equal_layer->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    // situation: aten::ne
+    // we should set another two layers: all-ones layer and kXOR.
+    if (node->kind() == torch::jit::aten::ne) {
+        // XOR with ones negates and produces not_equal result
+        auto options = torch::TensorOptions().dtype(torch::kFloat32);
+        auto ones = at::full({1}, 1, {options});
+        auto ones_tensor = tensor_to_const(engine, ones);
+        nvinfer1::IIdentityLayer* cast_layer = engine->network()->addIdentity(*ones_tensor);
+        cast_layer->setName((layer_info(node) + "_IIdentityLayer").c_str());
+        cast_layer->setOutputType(0, nvinfer1::DataType::kBOOL);
+
+        //xor layer
+        auto xor_op = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kXOR,
+                cast_layer->getOutput(0),
+                equal_layer->getOutput(0),
+                layer_info(node) + "_xor");
+        POROS_CHECK(xor_op, "Unable to create ne (not equal) layer from node: " << *node);
+        engine->context().set_tensor(node->outputs()[0], xor_op->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << xor_op->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    POROS_THROW_ERROR("Meet some unknown node kind in EqualOrNotequalConverter");
+    return false;
+}
+
+/*
+"aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor",
+"aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor",*/
+bool PowOrFloordivideConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for PowOrFloordivideConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for PowOrFloordivideConverter is not Tensor as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto other = engine->context().get_tensor(inputs[1]);
+    //when other input is Scalar
+    if (other == nullptr) {
+        auto other_const = engine->context().get_constant(inputs[1]);
+        if (other_const.isScalar()) {
+            auto other_scalar = other_const.toScalar().to<float>();
+            other = tensor_to_const(engine, torch::tensor({other_scalar}));
+        } else {
+            POROS_THROW_ERROR("Unable to get input other value for PowOrFloordivideConverter");
+        }
+    }
+
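+    // Illustrative note (not in the original code): kPOW is an elementwise power with
+    // the usual broadcasting, e.g. self = [2.0, 3.0] and exponent = 2 -> [4.0, 9.0].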
+    nvinfer1::ElementWiseOperation ew_option;
+    std::string name_suffix;
+    if (node->kind() == torch::jit::aten::pow) {
+        ew_option = nvinfer1::ElementWiseOperation::kPOW;
+        name_suffix = "_pow";
+    //TODO: handle the floor_divide situation
+    // } else if (node->kind() == torch::jit::aten::floor_divide) {
+    //     ew_option = nvinfer1::ElementWiseOperation::kFLOOR_DIV;
+    //     name_suffix = "_floor_div";
+    } else {
+        POROS_THROW_ERROR("Meet some unknown node kind in PowOrFloordivideConverter");
+    }
+
+    auto new_layer = add_elementwise(engine,
+            ew_option,
+            self,
+            other,
+            layer_info(node) + name_suffix);
+    POROS_CHECK(new_layer, "Unable to create pow or floor_divide layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+/*
+"aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor",
+"aten::clamp_min(Tensor self, Scalar min) -> Tensor",
+"aten::clamp_max(Tensor self, Scalar max) -> Tensor",*/
+bool ClampConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2 || inputs.size() == 3), "invalid inputs size for ClampConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ClampConverter is not Tensor as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto clamp_layer_out = self;
+
+    torch::jit::IValue maybe_min;
+    torch::jit::IValue maybe_max;
+    if (node->kind() == torch::jit::aten::clamp) {
+        maybe_min = engine->context().get_constant(inputs[1]);
+        maybe_max = engine->context().get_constant(inputs[2]);
+    } else if (node->kind() == torch::jit::aten::clamp_min) {
+        maybe_min = engine->context().get_constant(inputs[1]);
+        maybe_max = torch::jit::IValue();
+    } else { //node->kind() == torch::jit::aten::clamp_max
+        maybe_min = torch::jit::IValue();
+        maybe_max = engine->context().get_constant(inputs[1]);
+    }
+
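+    // Illustrative note (not in the original code): clamp is assembled from elementwise
+    // bounds, kMAX with min first and then kMIN with max (same order as pytorch),
+    // e.g. self = [-2, 0, 5], min = -1, max = 3: kMAX -> [-1, 0, 5], kMIN -> [-1, 0, 3].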
+    if (maybe_min.isScalar() && maybe_max.isScalar()) {
+        // note: same as pytorch, first max, then min
+        auto limit = maybe_min.toScalar().to<float>();
+        auto limit_tensor = tensor_to_const(engine, torch::tensor({limit}));
+        auto limit_layer = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kMAX,
+                self,
+                limit_tensor,
+                layer_info(node) + "_max");
+        POROS_CHECK(limit_layer, "Unable to create elementwise(KMAX) layer for node: " << *node);
+        clamp_layer_out = limit_layer->getOutput(0);
+        limit = maybe_max.toScalar().to<float>();
+        limit_tensor = tensor_to_const(engine, torch::tensor({limit}));
+        limit_layer = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kMIN,
+                clamp_layer_out,
+                limit_tensor,
+                layer_info(node) + "_min");
+        POROS_CHECK(limit_layer, "Unable to create elementwise(KMIN) layer for node: " << *node);
+        clamp_layer_out = limit_layer->getOutput(0);
+    } else if (maybe_min.isScalar()) {
+        auto limit = maybe_min.toScalar().to<float>();
+        auto limit_tensor = tensor_to_const(engine, torch::tensor({limit}));
+        auto limit_layer = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kMAX,
+                self,
+                limit_tensor,
+                layer_info(node) + "_max");
+        POROS_CHECK(limit_layer, "Unable to create elementwise(KMAX) layer for node: " << *node);
+        clamp_layer_out = limit_layer->getOutput(0);
+    } else if (maybe_max.isScalar()) {
+        auto limit = maybe_max.toScalar().to<float>();
+        auto limit_tensor = tensor_to_const(engine, torch::tensor({limit}));
+        auto limit_layer = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kMIN,
+                self,
+                limit_tensor,
+                layer_info(node) + "_min");
+        POROS_CHECK(limit_layer, "Unable to create elementwise(KMIN) layer for node: " << *node);
+        clamp_layer_out = limit_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], clamp_layer_out);
+    LOG(INFO) << "Output tensor shape: " << clamp_layer_out->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, GreaterOrLessConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, EqualOrNotequalConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, PowOrFloordivideConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ClampConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/element_wise.h b/poros/src/poros/converter/gpu/element_wise.h
new file mode 100644
index 0000000000..6851063085
--- /dev/null
+++ b/poros/src/poros/converter/gpu/element_wise.h
@@ -0,0 +1,181 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file element_wise.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 27 15:32:36 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class GreaterOrLessConverter : public GpuConverter {
+public:
+    GreaterOrLessConverter() {}
+    virtual ~GreaterOrLessConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    // note: gt.int, lt.int, ge.int and le.int should perhaps not be supported here,
+    // because they usually appear together with if or loop.
+    const std::vector<std::string> schema_string() {
+        return {"aten::gt.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::gt.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::lt.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::lt.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::ge.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::ge.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::le.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::le.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::gt.int(int a, int b) -> (bool)",
+                "aten::lt.int(int a, int b) -> (bool)",
+                "aten::ge.int(int a, int b) -> (bool)",
+                "aten::le.int(int a, int b) -> (bool)",
+                };
+    }
+
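+    // Note (not in the original code): converter() resolves the matched schema by its
+    // position in this list, so the four *.int entries must stay at indices 8-11;
+    // reordering the vector would silently break that dispatch.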
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::gt,
+                torch::jit::aten::lt,
+                torch::jit::aten::ge,
+                torch::jit::aten::le};
+    }
+
+private:
+    nvinfer1::ITensor* scalar_to_nvtensor(TensorrtEngine* engine, at::Scalar s);
+};
+
+class EqualOrNotequalConverter : public GpuConverter {
+public:
+    EqualOrNotequalConverter() {}
+    virtual ~EqualOrNotequalConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    // note: eq.int and ne.int should perhaps not be supported here,
+    // because they usually appear together with if or loop.
+    const std::vector<std::string> schema_string() {
+        return {"aten::eq.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::eq.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::ne.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::ne.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::eq.int(int a, int b) -> (bool)",
+                "aten::ne.int(int a, int b) -> (bool)"
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::eq,
+                torch::jit::aten::ne,
+                };
+    }
+};
+
+class PowOrFloordivideConverter : public GpuConverter {
+public:
+    PowOrFloordivideConverter() {}
+    virtual ~PowOrFloordivideConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor",
+                "aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor",
+                //"aten::floor_divide(Tensor self, Tensor other) -> Tensor",
+                //"aten::floor_divide.Scalar(Tensor self, Scalar other) -> Tensor",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::pow.Scalar(Scalar self, Tensor exponent) -> Tensor",
+     *
+     * aten::floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::pow,
+                //torch::jit::aten::floor_divide,
+                };
+    }
+};
+
+class ClampConverter : public GpuConverter {
+public:
+    ClampConverter() {}
+    virtual ~ClampConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor",
+                "aten::clamp_min(Tensor self, Scalar min) -> Tensor",
+                "aten::clamp_max(Tensor self, Scalar max) -> Tensor",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor",
+     * "aten::clamp_min.Tensor(Tensor self, Tensor min) -> Tensor",
+     * "aten::clamp_max.Tensor(Tensor self, Tensor max) -> Tensor",
+     *
+     * "aten::clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)",
+     *
+     * "aten::clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)"
+     * "aten::clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::clamp,
+                torch::jit::aten::clamp_min,
+                torch::jit::aten::clamp_max,
+                };
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/expand.cpp b/poros/src/poros/converter/gpu/expand.cpp
new file mode 100644
index 0000000000..7606fe1d79
--- /dev/null
+++ b/poros/src/poros/converter/gpu/expand.cpp
@@ -0,0 +1,315 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file expand.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/expand.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)",
+"aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)"*/
+bool ExpandConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3 || inputs.size() == 2), "invalid inputs size for ExpandConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ExpandConverter is not Tensor as expected");
+    if (inputs.size() == 3) {
+        POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+            "input[2] for ExpandConverter does not come from prim::Constant as expected");
+    }
+    //extract in
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto input_dims = in->getDimensions();
+    auto input_rank = in->getDimensions().nbDims;
+
+    //extract target_dims & init expanded_dims_tensor
+    nvinfer1::Dims target_dims;
+    nvinfer1::ITensor* expanded_dims_tensor = nullptr;
+    bool is_expand_layer = false;
+    bool has_tensor_scalar = false;
+    if (node->kind() == torch::jit::aten::expand) {
+        has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+        if (has_tensor_scalar) {
+            expanded_dims_tensor = get_tensor_scalar(inputs[1]);
+            POROS_CHECK_TRUE((expanded_dims_tensor != nullptr), node_info(node) + std::string("failed to get int nvtensor."));
+            target_dims = expanded_dims_tensor->getDimensions();
+        } else {
+            auto expanded_size = (engine->context().get_constant(inputs[1])).toIntList();
+            target_dims = sizes_to_nvdim(expanded_size);
+            auto expanded_size_tensor = torch::tensor(expanded_size.vec(), torch::kInt32);
+            expanded_dims_tensor = tensor_to_const(engine, expanded_size_tensor);
+        }
+        is_expand_layer = true;
+    } else { //node->kind() == torch::jit::aten::expand_as
+        auto target_tensor = engine->context().get_tensor(inputs[1]);
+        target_dims = target_tensor->getDimensions();
+        expanded_dims_tensor = engine->network()->addShape(*target_tensor)->getOutput(0);
+    }
+    auto output_rank = target_dims.nbDims;
+    if (has_tensor_scalar) {
+        output_rank = target_dims.d[0];
+    }
+
+    POROS_CHECK(input_rank <= output_rank,
+        "Number of dimensions of the desired expansion must be greater than or equal to the number of input dimensions");
+
+    auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic;
+
+    //situation1: ---------- when input is dynamic shape -------------
+    if (is_dynamic_shape) {
+        // Validate the expansion. Eg: an input of [3, 1] can be expanded to [1, 3, 4] but not [3, 4, 1]
+        if (!has_tensor_scalar) {
+            for (int64_t i = target_dims.nbDims - 1; i >= 0; --i) {
+                int64_t offset = target_dims.nbDims - 1 - i;
+                int64_t dim = input_dims.nbDims - 1 - offset;
+                int64_t size = (dim >= 0) ? input_dims.d[dim] : 1;
+                int64_t target_size = target_dims.d[i];
+                // Passing -1 as the size for a dimension means not changing the size of that dimension in expand layer.
+                if (target_size != -1) {
+                    if (size != target_size) {
+                        // if size == -1, we can't validate the expansion before setBindingDimensions.
+                        POROS_CHECK_TRUE((size == -1 || size == 1), "The expanded size of tensor (" << std::to_string(target_size) << ")"
+                            << " must match the existing size (" << std::to_string(size) << ")" << " at dimension " << i);
+                    }
+                } else {
+                    // target_size of expand must not be -1 (it comes from an int list), but expand_as
+                    // may see -1, because the real size is obtained through the shape tensor.
+                    POROS_CHECK_TRUE(!(is_expand_layer && dim < 0), "The target dims " << target_dims << " for node ["
+                        << node_info(node) << "] is illegal, should not have -1 value");
+                }
+            }
+        } else {
+            LOG(INFO) << "The int-list tensor for aten::expand may be invalid, because there is no check for it here.";
+        }
+
+        size_t max_rank = std::max(input_rank, output_rank);
+        // Dimensions are right alignment. Eg: an input of [3, 1] and max_rank = 4, the result of concat is [1, 1, 3, 1]
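+        // Illustrative sketch (not in the original code) of how the slice below expands:
+        //   sizes   = max(padded_input_shape, target_shape)  // broadcast extents
+        //   strides = min(1, padded_input_shape - 1)         // 0 on singleton dims
+        // A stride of 0 re-reads the same element, so dims of size 1 are stretched,
+        // e.g. padded [1, 1, 3, 1] with target [2, 2, 3, 4] slices to shape [2, 2, 3, 4].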
+        nvinfer1::ITensor* new_input_shape_tensor = nullptr;
+        if (max_rank - input_rank > 0) {
+            torch::Tensor the_one = torch::tensor(std::vector<int32_t>(max_rank - input_rank, 1), torch::kInt32);
+            auto one_tensor = tensor_to_const(engine, the_one);
+            auto in_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+            nvinfer1::ITensor* const args[2] = {one_tensor, in_shape_tensor};
+            new_input_shape_tensor = engine->network()->addConcatenation(args, 2)->getOutput(0);
+        } else { //max_rank - input_rank == 0
+            new_input_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+        }
+        auto new_output_shape_tensor = expanded_dims_tensor;
+
+        // Add a reshape layer to expand dims
+        auto shuffle = engine->network()->addShuffle(*in);
+        shuffle->setInput(1, *new_input_shape_tensor);
+        shuffle->setName((layer_info(node) + "_IShuffleLayer").c_str());
+
+        // Start the slicing from beginning of tensor since this is an expand layer
+        std::vector<int64_t> start_vec(max_rank, 0);
+        nvinfer1::Dims starts_dim = sizes_to_nvdim(c10::IntArrayRef(start_vec));
+        at::Tensor th_start = torch::tensor(nvdim_to_sizes(starts_dim), torch::kInt32);
+        auto starts = tensor_to_const(engine, th_start);
+
+        // compute sizes = max(x,y).
+        auto sizes = engine->network()->addElementWise(*new_input_shape_tensor,
+                *new_output_shape_tensor,
+                nvinfer1::ElementWiseOperation::kMAX)->getOutput(0);
+        nvinfer1::Dims sizes_dim{-1, {}};
+        sizes_dim.nbDims = max_rank;
+
+        // Compute (x > 1 ? 1 : 0) for x in newDims, assuming positive x, using only TensorRT operations.
+        // min(1, sub(input_shape, 1))
+        torch::Tensor thOne = torch::tensor({1}, torch::kInt32);
+        auto thone_tensor = tensor_to_const(engine, thOne);
+        auto x_sub_one = engine->network()->addElementWise(*new_input_shape_tensor,
+                *thone_tensor,
+                nvinfer1::ElementWiseOperation::kSUB)->getOutput(0);
+        auto strides = engine->network()->addElementWise(*thone_tensor,
+                *x_sub_one,
+                nvinfer1::ElementWiseOperation::kMIN)->getOutput(0);
+        nvinfer1::Dims strides_dim{-1, {}};
+        strides_dim.nbDims = max_rank;
+
+        // Slice layer does the expansion in TRT. Desired output size is specified by sizes input at index 2.
+        auto slice = engine->network()->addSlice(*shuffle->getOutput(0), starts_dim, sizes_dim, strides_dim);
+        slice->setInput(1, *starts);
+        slice->setInput(2, *sizes);
+        slice->setInput(3, *strides);
+        slice->setName((layer_info(node) + "_ISliceLayer").c_str());
+
+        engine->context().set_tensor(node->outputs()[0], slice->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << slice->getOutput(0)->getDimensions();
+        return true;
+
+    //situation2: ---------- when input is NOT dynamic shape -------------
+    } else {
+        // Validate the expansion. Eg: an input of [3, 1] can be expanded to [1, 3, 4] but not [3, 4, 1]
+        for (int64_t i = target_dims.nbDims - 1; i >= 0; --i) {
+            int64_t offset = target_dims.nbDims - 1 - i;
+            int64_t dim = input_dims.nbDims - 1 - offset;
+            int64_t size = (dim >= 0) ? input_dims.d[dim] : 1;
+            int64_t target_size = target_dims.d[i];
+            // In expand layer passing -1 as the size for a dimension means not changing the size of that dimension.
+            if (target_size != -1) {
+                if (size != target_size) {
+                    POROS_CHECK_TRUE((size == 1), "The expanded size of tensor (" << std::to_string(target_size) << ")"
+                        << " must match the existing size (" << std::to_string(size) << ")" << " at dimension " << i);
+                }
+            } else {
+                // target_size must not be -1 here.
+                POROS_CHECK_TRUE((dim >= 0), "The target dims " << target_dims << " for node ["
+                    << node_info(node) << "] is illegal, should not have -1 value");
+                // in(3, 1), expand(3, -1, 4) -> expand(3, 3, 4)
+                target_dims.d[i] = input_dims.d[dim];
+            }
+        }
+
+        auto num_expand_dims = target_dims.nbDims - input_dims.nbDims;
+        if (num_expand_dims > 0) {
+            nvinfer1::Dims reshape_dims;
+            reshape_dims.nbDims = target_dims.nbDims;
+            for (int64_t i = 0; i < num_expand_dims; i++) {
+                reshape_dims.d[i] = 1;
+            }
+            for (int64_t i = 0; i < input_dims.nbDims; i++) {
+                reshape_dims.d[num_expand_dims + i] = input_dims.d[i];
+            }
+
+            // Add a reshape layer to expand dims
+            auto reshape_layer = engine->network()->addShuffle(*in);
+            reshape_layer->setReshapeDimensions(reshape_dims);
+            reshape_layer->setName((layer_info(node) + "_IShuffleLayer").c_str());
+            in = reshape_layer->getOutput(0);
+            LOG(INFO) << "Input reshaped to : " << in->getDimensions() << " from " << input_dims;
+        }
+
+        // Start the slicing from beginning of tensor since this is an expand layer
+        std::vector<int64_t> start_vec(target_dims.nbDims, 0);
+        auto start_offset = sizes_to_nvdim(c10::IntArrayRef(start_vec));
+
+        // Set the stride of non singleton dimension to 1
+        std::vector<int64_t> strides_vec(target_dims.nbDims, 0);
+        for (int64_t i = 0; i < target_dims.nbDims; i++) {
+            strides_vec[i] = (in->getDimensions().d[i] != 1);
+        }
+
+        auto strides = sizes_to_nvdim(c10::IntArrayRef(strides_vec));
+        // Slice layer does the expansion in TRT. Desired output size is specified by target_dims
+        auto slice_layer = engine->network()->addSlice(*in, start_offset, target_dims, strides);
+        slice_layer->setName((layer_info(node) + "_ISliceLayer").c_str());
+        engine->context().set_tensor(node->outputs()[0], slice_layer->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << slice_layer->getOutput(0)->getDimensions();
+        return true;
+    }
+}
+
+/*
+"aten::repeat(Tensor self, int[] repeats) -> Tensor",
+*/
+bool RepeatConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for RepeatConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for RepeatConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for RepeatConverter does not come from prim::Constant as expected");
+
+    //extract in
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto input_dims = in->getDimensions();
+    int input_rank = input_dims.nbDims;
+
+    //extract repeats
+    auto repeats = (engine->context().get_constant(inputs[1])).toIntList().vec();
+    int repeats_rank = repeats.size();
+
+    POROS_CHECK(repeats_rank >= input_rank, "Number of repeat dimensions cannot be smaller than number of input dimensions");
+    auto num_expand_dims = repeats_rank - input_rank;
+
+    auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic;
+    if (is_dynamic_shape) {
+        nvinfer1::ITensor* new_input_shape_tensor;
+        if (num_expand_dims > 0) {
+            torch::Tensor the_one = torch::tensor(std::vector<int32_t>(num_expand_dims, 1), torch::kInt32);
+            auto one_tensor = tensor_to_const(engine, the_one);
+            auto in_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+            nvinfer1::ITensor* const args[2] = {one_tensor, in_shape_tensor};
+            new_input_shape_tensor = engine->network()->addConcatenation(args, 2)->getOutput(0);
+        } else { //num_expand_dims == 0
+            new_input_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+        }
+
+        // Add a reshape layer to expand dims
+        auto shuffle = engine->network()->addShuffle(*in);
+        shuffle->setInput(1, *new_input_shape_tensor);
+        shuffle->setName((layer_info(node) + "_IShuffleLayer").c_str());
+        in = shuffle->getOutput(0);
+    } else {
+        if (num_expand_dims > 0) {
+            nvinfer1::Dims reshape_dims;
+            reshape_dims.nbDims = repeats.size();
+            for (int i = 0; i < num_expand_dims; i++) {
+                reshape_dims.d[i] = 1;
+            }
+            for (int i = 0; i < input_rank; i++) {
+                reshape_dims.d[num_expand_dims + i] = input_dims.d[i];
+            }
+
+            // Add a reshape layer to expand dims
+            auto reshape_layer = engine->network()->addShuffle(*in);
+            reshape_layer->setReshapeDimensions(reshape_dims);
+            reshape_layer->setName((layer_info(node) + "_IShuffleLayer").c_str());
+            in = reshape_layer->getOutput(0);
+            LOG(INFO) << "Input reshaped to : " << in->getDimensions() << " from " << input_dims;
+        }
+    }
+
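+    // Illustrative note (not in the original code): repeat is realized as nested
+    // concatenation along each axis, e.g. in = [[1, 2]] (shape [1, 2]), repeats = [2, 3]:
+    //   axis 1: concat 3 copies -> shape [1, 6]; axis 0: concat 2 copies -> shape [2, 6],
+    // which matches torch.Tensor.repeat semantics.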
+    // Concat across all repeat axes.
+    // TODO: Implementation might not be performant. Explore other strategies to improve performance.
+    for (int i = repeats.size() - 1; i >= 0; --i) {
+        std::vector<nvinfer1::ITensor*> tensors_vec;
+        for (int j = 0; j < repeats[i]; j++) {
+            tensors_vec.push_back(in);
+        }
+        auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+        concat_layer->setAxis(i);
+        concat_layer->setName((layer_info(node) + "_IConcatenationLayer_" + std::to_string(i)).c_str());
+        in = concat_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], in);
+    LOG(INFO) << "Output tensor shape: " << in->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ExpandConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, RepeatConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/expand.h b/poros/src/poros/converter/gpu/expand.h
new file mode 100644
index 0000000000..f40f84078a
--- /dev/null
+++ b/poros/src/poros/converter/gpu/expand.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file expand.h
+* @author tianjinjin@baidu.com
+* @date Mon Aug 16 12:26:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ExpandConverter : public GpuConverter {
+public:
+    ExpandConverter() {}
+    virtual ~ExpandConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)",
+                "aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)",};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::expand,
+                torch::jit::aten::expand_as};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)", {1, 1}}});
+    }
+};
+
+class RepeatConverter : public GpuConverter {
+public:
+    RepeatConverter() {}
+    virtual ~RepeatConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::repeat(Tensor self, int[] repeats) -> Tensor"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::repeat};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/generate.cpp b/poros/src/poros/converter/gpu/generate.cpp
new file mode 100644
index 0000000000..61942a2aa8
--- /dev/null
+++ b/poros/src/poros/converter/gpu/generate.cpp
@@ -0,0 +1,578 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file generate.cpp
+* @author tianshaoqing@baidu.com
+* @date Mon Dec 6 14:29:20 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/generate.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+bool ZerosLikeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 6), "invalid inputs size for ZerosLikeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ZerosLikeConverter is not Tensor as expected");
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[1]);
+    if (maybe_type.isNone()) {
+        nvinfer1::ILayer* sub_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kSUB,
+                self, self, layer_info(node) + "_sub");
+        nvinfer1::ITensor* output = sub_layer->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+        return true;
+    } else {
+        nvinfer1::ITensor* shape_tensor = engine->network()->addShape(*self)->getOutput(0);
+        int32_t self_rank = (shape_tensor->getDimensions()).d[0];
+
+        at::ScalarType input_type = maybe_type.toScalarType();
+        nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+                nvinfer1::FillOperation::kLINSPACE);
+        fill_layer->setInput(0, *shape_tensor); // set the output shape
+
+        at::Tensor value_tensor = torch::tensor(0).to(input_type);
+        nvinfer1::ITensor* value_itensor = tensor_to_const(engine, value_tensor);
+        fill_layer->setInput(1, *value_itensor); // initial value
+
+        at::Tensor delta_tensor = torch::zeros(self_rank).to(input_type); // delta per axis, hence self_rank zeros
+        nvinfer1::ITensor* delta_itensor = tensor_to_const(engine, delta_tensor);
+        fill_layer->setInput(2, *delta_itensor);
+        fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+
+        nvinfer1::ITensor* output = fill_layer->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+        return true;
+    }
+}
+
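+// Illustrative note (not in the original code): with dtype=None the converter above
+// emits self - self (kSUB), which yields zeros with self's shape and dtype without an
+// explicit fill; with an explicit dtype it uses IFillLayer (kLINSPACE, start 0, delta 0).
+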
+// aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+bool ZerosConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for ZerosConverter");
+
+    bool has_tensor_scalar = false;
+    has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+    nvinfer1::ITensor* output = nullptr;
+    // extract dtype
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[1]);
+
+    if (has_tensor_scalar) {
+        // extract size
+        nvinfer1::ITensor* shape_tensor = engine->context().get_tensor(inputs[0]); // from size
+        POROS_CHECK_TRUE((shape_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+
+        nvinfer1::Dims self_dims = shape_tensor->getDimensions();
+        int64_t self_rank = self_dims.d[0];
+
+        nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+                nvinfer1::FillOperation::kLINSPACE);
+        fill_layer->setInput(0, *shape_tensor); // set the output shape
+        // default type is float
+        at::Tensor value_tensor = torch::tensor(0.0, torch::kFloat32);
+        at::Tensor delta_tensor = torch::zeros(self_rank, torch::kFloat32); // delta per axis, hence self_rank zeros
+        // type conversion
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+            delta_tensor = delta_tensor.to(maybe_type.toScalarType());
+        }
+        nvinfer1::ITensor* value_itensor = tensor_to_const(engine, value_tensor);
+        fill_layer->setInput(1, *value_itensor); // initial value
+        nvinfer1::ITensor* delta_itensor = tensor_to_const(engine, delta_tensor);
+        fill_layer->setInput(2, *delta_itensor);
+        fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+        output = fill_layer->getOutput(0);
+    } else {
+        std::vector<int64_t> self_vec = (engine->context().get_constant(inputs[0])).toIntList().vec();
+        at::Tensor value_tensor = torch::zeros(self_vec, torch::kFloat32);
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+        }
+        output = tensor_to_const(engine, value_tensor);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+
+// aten::ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+bool OnesConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for OnesConverter");
+
+    bool has_tensor_scalar = false;
+    has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+    nvinfer1::ITensor* output = nullptr;
+    // extract dtype
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[1]);
+
+    if (has_tensor_scalar) {
+        // extract size
+        nvinfer1::ITensor* shape_tensor = engine->context().get_tensor(inputs[0]); // from size
+        POROS_CHECK_TRUE((shape_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+
+        nvinfer1::Dims self_dims = shape_tensor->getDimensions();
+        int64_t self_rank = self_dims.d[0];
+
+        nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+                nvinfer1::FillOperation::kLINSPACE);
+        fill_layer->setInput(0, *shape_tensor); // set the output shape
+        // default type is float
+        at::Tensor value_tensor = torch::tensor(1.0, torch::kFloat32);
+        at::Tensor delta_tensor = torch::zeros(self_rank, torch::kFloat32); // delta per axis, hence self_rank zeros
+        // type conversion
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+            delta_tensor = delta_tensor.to(maybe_type.toScalarType());
+        }
+        nvinfer1::ITensor* value_itensor = tensor_to_const(engine, value_tensor);
+        fill_layer->setInput(1, *value_itensor); // initial value
+        nvinfer1::ITensor* delta_itensor = tensor_to_const(engine, delta_tensor);
+        fill_layer->setInput(2, *delta_itensor);
+        fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+        output = fill_layer->getOutput(0);
+    } else {
+        std::vector<int64_t> self_vec = (engine->context().get_constant(inputs[0])).toIntList().vec();
+        at::Tensor value_tensor = torch::ones(self_vec, torch::kFloat32);
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+        }
+        output = tensor_to_const(engine, value_tensor);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+
+// aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+bool FullConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 6), "invalid inputs size for FullConverter");
+
+    bool has_tensor_scalar = false;
+    has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+    nvinfer1::ITensor* output = nullptr;
+    // extract fill_value
+    torch::jit::IValue maybe_value = engine->context().get_constant(inputs[1]);
+    POROS_CHECK_TRUE((!maybe_value.isNone()), "Unable to init input fill value for node: " << *node);
+    float fill_value = maybe_value.toScalar().toFloat();
+    // extract dtype
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[2]);
+
+    if (has_tensor_scalar) {
+        // extract size
+        nvinfer1::ITensor* shape_tensor = engine->context().get_tensor(inputs[0]); // from size
+        POROS_CHECK_TRUE((shape_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+
+        nvinfer1::Dims self_dims = shape_tensor->getDimensions();
+        int64_t self_rank = self_dims.d[0];
+
+        nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+                nvinfer1::FillOperation::kLINSPACE);
+        fill_layer->setInput(0, *shape_tensor); // set the output shape
+        // default type is float
+        at::Tensor value_tensor = torch::tensor(fill_value, torch::kFloat32);
+        at::Tensor delta_tensor = torch::zeros(self_rank, torch::kFloat32); // delta per axis, hence self_rank zeros
+        // type conversion
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+            delta_tensor = delta_tensor.to(maybe_type.toScalarType());
+        }
+        nvinfer1::ITensor* value_itensor = tensor_to_const(engine, value_tensor);
+        fill_layer->setInput(1, *value_itensor); // initial value
+        nvinfer1::ITensor* delta_itensor = tensor_to_const(engine, delta_tensor);
+        fill_layer->setInput(2, *delta_itensor);
+        fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+        output = fill_layer->getOutput(0);
+    } else {
+        std::vector<int64_t> self_vec = (engine->context().get_constant(inputs[0])).toIntList().vec();
+        at::Tensor value_tensor = torch::ones(self_vec, torch::kFloat32) * fill_value;
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+        }
+        output = tensor_to_const(engine, value_tensor);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+// reduce input_tensor with shape [1] to a rank-0 tensor
+static nvinfer1::ITensor* reduce_dim1_to_dim0(TensorrtEngine* engine, nvinfer1::ITensor* input_tensor) {
+    nvinfer1::Dims input_dims = input_tensor->getDimensions();
+    if (input_dims.nbDims == 1 && input_dims.d[0] == 1) {
+        nvinfer1::IShuffleLayer* shuffle_l = engine->network()->addShuffle(*input_tensor);
+        nvinfer1::Dims squeeze_dim;
+        squeeze_dim.nbDims = 0;
+        shuffle_l->setReshapeDimensions(squeeze_dim);
+        return shuffle_l->getOutput(0);
+    } else {
+        return input_tensor;
+    }
+}
+
+// aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+// aten::arange.start(Scalar start, Scalar end, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
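+// Illustrative note (not in the original code): the dynamic-shape path below maps onto
+// IFillLayer's kLINSPACE fill, where the fill dims are (end - start), alpha is start and
+// delta is 1, e.g. start = 2, end = 6 -> size 4 -> [2, 3, 4, 5].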
+bool ArangeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5 || inputs.size() == 6), "invalid inputs size for ArangeConverter.");
+    // start and end currently only support the int type
+    POROS_CHECK_TRUE((node->inputs()[0]->type()->kind() == c10::TypeKind::IntType),
+        "The type of input[0] for ArangeConverter must be Int.");
+    if (inputs.size() == 6) {
+        POROS_CHECK_TRUE((node->inputs()[1]->type()->kind() == c10::TypeKind::IntType),
+            "The type of input[1] for ArangeConverter must be Int.");
+    }
+
+    int type_input_index = inputs.size() - 4;
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[type_input_index]);
+
+    if (check_inputs_tensor_scalar(engine, node)) {
+        nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+                nvinfer1::FillOperation::kLINSPACE);
+        if (inputs.size() == 5) {
+            nvinfer1::ITensor* end_tensor = this->get_tensor_scalar(inputs[0]);
+            // set the output shape
+            fill_layer->setInput(0, *end_tensor);
+            // set start and delta
+            at::Tensor value_tensor = torch::tensor(0, torch::kInt32);
+            at::Tensor delta_tensor = torch::ones(1, torch::kInt32);
+            auto value_itensor = tensor_to_const(engine, value_tensor);
+            fill_layer->setInput(1, *value_itensor);
+            auto delta_itensor = tensor_to_const(engine, delta_tensor);
+            fill_layer->setInput(2, *delta_itensor);
+        } else {
+            nvinfer1::ITensor* start_tensor = this->get_tensor_scalar(inputs[0]);
+            nvinfer1::ITensor* end_tensor = this->get_tensor_scalar(inputs[1]);
+            // arange_size = end - start
+            nvinfer1::ITensor* arange_size = add_elementwise(engine,
+                    nvinfer1::ElementWiseOperation::kSUB,
+                    end_tensor,
+                    start_tensor,
+                    layer_info(node) + "_get_arange_size")->getOutput(0);
+            // set the output shape
+            fill_layer->setInput(0, *arange_size);
+            // set start and delta
+            start_tensor = reduce_dim1_to_dim0(engine, start_tensor);
+            fill_layer->setInput(1, *start_tensor);
+            at::Tensor delta_tensor = torch::ones(1, torch::kInt32);
+            auto delta_itensor = tensor_to_const(engine, delta_tensor);
+            fill_layer->setInput(2, *delta_itensor);
+        }
+        fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+        nvinfer1::ITensor* output = fill_layer->getOutput(0);
+
+        if (!maybe_type.isNone()) {
+            at::ScalarType scalar_type = maybe_type.toScalarType();
+            if (scalar_type == at::ScalarType::Long) {
+                scalar_type = at::ScalarType::Int;
+                LOG(WARNING) << "aten::arange Converter meets c10::ScalarType::Long tensor type, change this to c10::ScalarType::Int. "
+                        << "Attention: this may lead to precision change";
+            }
+            nvinfer1::DataType output_type = attype_to_nvtype(scalar_type);
+            // Set datatype for data to dtype
+            auto identity = engine->network()->addIdentity(*output);
+            identity->setName((layer_info(node) + "_identity_output").c_str());
+            identity->setOutputType(0, output_type);
+            output = identity->getOutput(0);
+        }
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+
+    } else {
+        at::Tensor value_tensor;
+        if (inputs.size() == 5) {
+            int64_t end = engine->context().get_constant(inputs[0]).toInt();
+            value_tensor = torch::arange(end, torch::kInt);
+
+        } else {
+            int64_t start = engine->context().get_constant(inputs[0]).toInt();
+            int64_t end = engine->context().get_constant(inputs[1]).toInt();
+            value_tensor = torch::arange(start, end, torch::kInt);
+        }
+        if (!maybe_type.isNone()) {
+            value_tensor = value_tensor.to(maybe_type.toScalarType());
+        }
+        nvinfer1::ITensor* output = tensor_to_const(engine, value_tensor);
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    }
+    return true;
+}
+
+// aten::tensor(t[] data, *, int? dtype=None, Device? device=None, bool requires_grad=False) -> (Tensor)
+bool TensorConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for TensorConverter");
+    // extract dtype
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[1]);
+
+    nvinfer1::ITensor* output = nullptr;
+    if (check_inputs_tensor_scalar(engine, node)) {
+        output = this->get_tensor_scalar(inputs[0]);
+        if (!maybe_type.isNone()) {
+            at::ScalarType scalar_type = maybe_type.toScalarType();
+            if (scalar_type == at::ScalarType::Long) {
+                scalar_type = at::ScalarType::Int;
+                LOG(WARNING) << "aten::tensor Converter meets c10::ScalarType::Long tensor type, change this to c10::ScalarType::Int. "
+                        << "Attention: this may lead to precision change";
+            }
+            auto output_type = attype_to_nvtype(scalar_type);
+            // Set datatype for data to dtype
+            auto identity = engine->network()->addIdentity(*output);
+            identity->setName((layer_info(node) + "_IIdentityLayer_for_output").c_str());
+            identity->setOutputType(0, output_type);
+            output = identity->getOutput(0);
+        }
+        // mark: 06.30 by tsq
+        // If the schema is aten::tensor.int and its output is also a subgraph output (i.e. the
+        // output goes back to torchscript), the output rank must be changed to 0, otherwise a
+        // core dump occurs.
+        // In theory every tensor produced by aten::tensor.int should have rank 0, but then
+        // output->getDimensions() prints as empty [].
+        // It is not yet clear how a rank-0 nvtensor affects other ops, so for now the squeeze
+        // is restricted to the case where the aten::tensor output is a subgraph output.
+        bool need_squeeze_dim = false;
+        if (node->hasUses()) {
+            auto users_list = node->output(0)->uses();
+            for (size_t i = 0; i < users_list.size(); i++) {
+                if (users_list[i].user->kind() == torch::jit::prim::Return) {
+                    need_squeeze_dim = true;
+                    break;
+                }
+            }
+        }
+
+        if (need_squeeze_dim && inputs[0]->type()->kind() == c10::TypeKind::IntType) {
+            nvinfer1::IShuffleLayer* shuffle_l = engine->network()->addShuffle(*output);
+            nvinfer1::Dims squeeze_dim;
+            squeeze_dim.nbDims = 0;
+            shuffle_l->setReshapeDimensions(squeeze_dim);
+            shuffle_l->setName((layer_info(node) + "_IShuffleLayer").c_str());
+            output = shuffle_l->getOutput(0);
+            engine->context().set_tensor(node->outputs()[0], output);
+            return true;
+        }
+
+    } else {
+        // extract data
+        at::Tensor input_data = engine->context().get_constant(inputs[0]).toTensor();
+        if (!maybe_type.isNone()) {
+            input_data = input_data.to(maybe_type.toScalarType());
+        }
+        output = tensor_to_const(engine, input_data);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+// aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+bool LinspaceConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 7), "invalid inputs size for LinspaceConverter.");
+
+    bool has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+
+    auto fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+    if (has_tensor_scalar) {
+        nvinfer1::ITensor* start = this->get_tensor_scalar(inputs[0]);
+        nvinfer1::ITensor* end = this->get_tensor_scalar(inputs[1]);
+        nvinfer1::ITensor* step = this->get_tensor_scalar(inputs[2]);
+        // when steps is None, the default is 100
+        if (step == nullptr) {
+            step = tensor_to_const(engine, at::tensor({100}, at::ScalarType::Float));
+        }
+        // the default output type is float, so the operands below must be cast to float as well
+        // alpha=start, delta=(end - start) / (step - 1)
+        if (start->getType() != nvinfer1::DataType::kFLOAT) {
+            auto identity = engine->network()->addIdentity(*start);
+            identity->setOutputType(0, nvinfer1::DataType::kFLOAT);
+            identity->setName((layer_info(node) + "_IIdentityLayer_for_start").c_str());
+            start = identity->getOutput(0);
+        }
+        if (end->getType() != nvinfer1::DataType::kFLOAT) {
+            auto identity = engine->network()->addIdentity(*end);
+            identity->setOutputType(0, nvinfer1::DataType::kFLOAT);
+            identity->setName((layer_info(node) + "_IIdentityLayer_for_end").c_str());
+            end = identity->getOutput(0);
+        }
+        if (step->getType() != nvinfer1::DataType::kFLOAT) {
+            auto identity = engine->network()->addIdentity(*step);
+            identity->setOutputType(0, nvinfer1::DataType::kFLOAT);
+            identity->setName((layer_info(node) + "_IIdentityLayer_for_step").c_str());
+            step = identity->getOutput(0);
+        }
+        // (end - start)
+        nvinfer1::ILayer* sub_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kSUB,
+                end, start, layer_info(node) + "_sub(end_start)");
+        nvinfer1::ITensor* length = sub_layer->getOutput(0);
+        // (step - 1)
+        nvinfer1::ITensor* one = tensor_to_const(engine, at::tensor({1}, at::ScalarType::Float));
+        nvinfer1::ILayer* sub_layer2 = add_elementwise(engine, nvinfer1::ElementWiseOperation::kSUB,
+                step, one, layer_info(node) + "_sub(step_one)");
+        nvinfer1::ITensor* step_sub_one = sub_layer2->getOutput(0);
+        // (end - start) / (step - 1)
+        nvinfer1::ILayer* div_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kDIV,
+                length, step_sub_one, layer_info(node) + "_div(get_delta)");
+        nvinfer1::ITensor* delta = div_layer->getOutput(0);
+        // step must be cast back to int32 for IFillLayer input 0, which specifies the output dims
+        if (step->getType() == nvinfer1::DataType::kFLOAT) {
+            auto identity = engine->network()->addIdentity(*step);
+            identity->setOutputType(0, nvinfer1::DataType::kINT32);
+            identity->setName((layer_info(node) + "_IIdentityLayer_for_step_back").c_str());
+            step = identity->getOutput(0);
+        }
+        // the output is one-dimensional; IFillLayer requires start to have rank 0
+        // (check_inputs_tensor_scalar adds one dim when converting the start scalar to an nvtensor)
+        if (start->getDimensions().nbDims > 0) {
+            nvinfer1::IShuffleLayer* shuffle_l = engine->network()->addShuffle(*start);
+            nvinfer1::Dims start_dim;
+            start_dim.nbDims = 0;
+            shuffle_l->setReshapeDimensions(start_dim);
+            shuffle_l->setName((layer_info(node) + "_IShuffleLayer_for_start").c_str());
+            start = shuffle_l->getOutput(0);
+        }
+        fill_layer->setInput(0, *step);
+        fill_layer->setInput(1, *start);
+        fill_layer->setInput(2, *delta);
+    } else {
+        torch::jit::IValue start_ivalue = engine->context().get_constant(inputs[0]);
+        torch::jit::IValue end_ivalue = engine->context().get_constant(inputs[1]);
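+        // Illustrative note (not in the original code): kLINSPACE fills
+        // output[i] = start + i * delta, so for start = 0, end = 10, steps = 5:
+        // delta = (10 - 0) / (5 - 1) = 2.5 and the result is [0.0, 2.5, 5.0, 7.5, 10.0],
+        // matching torch.linspace(0, 10, 5).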
+        torch::jit::IValue maybe_step = engine->context().get_constant(inputs[2]);
+        float start = start_ivalue.toScalar().to<float>();
+        float end = end_ivalue.toScalar().to<float>();
+        float step = 100.0;
+        if (!maybe_step.isNone()) {
+            step = maybe_step.toScalar().to<float>();
+        }
+        float delta = (end - start) / (step - 1);
+        std::vector<int64_t> output_dims = {(int64_t)step};
+        fill_layer->setDimensions(sizes_to_nvdim(output_dims));
+        fill_layer->setAlpha(start);
+        fill_layer->setBeta(delta);
+    }
+
+    fill_layer->setName((layer_info(node) + "_IFillLayer").c_str());
+    nvinfer1::ITensor* output = fill_layer->getOutput(0);
+
+    // extract dtype
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[3]);
+    // if dtype is not None, cast the output type at the end
+    if (!maybe_type.isNone()) {
+        nvinfer1::DataType output_type = attype_to_nvtype(maybe_type.toScalarType());
+        auto identity = engine->network()->addIdentity(*output);
+        identity->setName((layer_info(node) + "_IIdentityLayer_for_output").c_str());
+        identity->setOutputType(0, output_type);
+        output = identity->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+// aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+bool FulllikeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 7), "invalid inputs size for FulllikeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for FulllikeConverter is not Tensor as expected");
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    nvinfer1::Dims self_dims = self->getDimensions();
+
+    // take input[1] as float first
+    auto scalar_ivalue = (engine->context().get_constant(inputs[1]));
+    float scalar = scalar_ivalue.toScalar().to<float>();
+
+    // extract type
+    torch::jit::IValue maybe_type = engine->context().get_constant(inputs[2]);
+
+    bool is_dynamic = check_nvtensor_is_dynamic(self);
+
+    nvinfer1::IFillLayer* fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}},
+            nvinfer1::FillOperation::kLINSPACE);
+    // set fill shape
+    if (is_dynamic) {
+        nvinfer1::ITensor* shape_tensor = engine->network()->addShape(*self)->getOutput(0);
+        fill_layer->setInput(0, *shape_tensor);
+    } else {
+        fill_layer->setDimensions(self_dims);
+    }
+
+    at::ScalarType init_type = (inputs[0]->type()->cast<c10::TensorType>())->scalarType().value();
+    if (init_type == at::ScalarType::Long) {
+        init_type = at::ScalarType::Int;
+    } else if (init_type == at::ScalarType::Double) {
+        init_type = at::ScalarType::Float;
+    }
+    // by default the output type follows self, consistent with torch
+    at::Tensor value_tensor = torch::tensor(scalar, {init_type});
+    at::Tensor delta_tensor = torch::zeros(self_dims.nbDims, {init_type}); // delta per axis, hence self_rank zeros
+    if (!maybe_type.isNone()) {
+        at::ScalarType input_type = maybe_type.toScalarType();
+        if (input_type == at::ScalarType::Long) {
+            input_type = at::ScalarType::Int;
+        } else if (input_type == at::ScalarType::Double) {
+            input_type = at::ScalarType::Float;
+        }
+        value_tensor = value_tensor.to(input_type);
+        delta_tensor = delta_tensor.to(input_type);
+    }
+
+    nvinfer1::ITensor* value_itensor = tensor_to_const(engine, value_tensor);
+    fill_layer->setInput(1, *value_itensor); // initial value
+
+    nvinfer1::ITensor* delta_itensor = tensor_to_const(engine, delta_tensor);
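+    // Illustrative note (not in the original code): a kLINSPACE fill whose per-axis
+    // delta is all zeros writes the start value into every element, i.e. full_like.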
delta_itensor = tensor_to_const(engine, delta_tensor); + fill_layer->setInput(2, *delta_itensor); + nvinfer1::ITensor* output = fill_layer->getOutput(0); + + fill_layer->setName((layer_info(node) + "_IFillLayer").c_str()); + engine->context().set_tensor(node->outputs()[0], output); + LOG(INFO) << "Output tensor shape: " << output->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, ZerosLikeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, ZerosConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, OnesConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, FullConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, ArangeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, TensorConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, LinspaceConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, FulllikeConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/generate.h b/poros/src/poros/converter/gpu/generate.h new file mode 100644 index 0000000000..22775480df --- /dev/null +++ b/poros/src/poros/converter/gpu/generate.h @@ -0,0 +1,212 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file generate.h +* @author tianshaoqing@baidu.com +* @date Mon Dec 6 14:29:20 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include +#include + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// Tensor zeros_like(const Tensor & self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); +class ZerosLikeConverter : public GpuConverter { +public: + ZerosLikeConverter() {} + virtual ~ZerosLikeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::zeros_like}; + } +}; + +// Tensor zeros(IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); +// aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor" +class ZerosConverter : public GpuConverter { +public: + ZerosConverter() {} + virtual ~ZerosConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::zeros}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", {1, 1}}}); + } +}; + +class OnesConverter : public GpuConverter { +public: + OnesConverter() {} + virtual ~OnesConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::ones}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", {1, 1}}}); + } +}; + +class FullConverter : public GpuConverter { +public: + FullConverter() {} + virtual ~FullConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::full}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", {1, 1}}}); + } +}; + +class ArangeConverter : public GpuConverter { +public: + ArangeConverter() {} + virtual ~ArangeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", + "aten::arange.start(Scalar start, Scalar end, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)",}; + } + + const std::vector node_kind() { + return {torch::jit::aten::arange}; + } + + bool assign_schema_attr() { + bool result = true; + result &= assign_schema_attr_helper({{"aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", {1, 1}}}); + result &= assign_schema_attr_helper({{"aten::arange.start(Scalar start, Scalar end, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)", {1, 1}}}); + return result; + } +}; + +class TensorConverter : public GpuConverter { +public: + TensorConverter() {} + virtual ~TensorConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::tensor(t[] data, *, int? dtype=None, Device? device=None, bool requires_grad=False) -> (Tensor)", + "aten::tensor.int(int t, *, int? dtype=None, Device? device=None, bool requires_grad=False) -> (Tensor)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::tensor}; + } + + bool assign_schema_attr() { + bool result = true; + result &= assign_schema_attr_helper({{"aten::tensor(t[] data, *, int? dtype=None, Device? device=None, bool requires_grad=False) -> (Tensor)", {1, 1}}}); + result &= assign_schema_attr_helper({{"aten::tensor.int(int t, *, int? 
dtype=None, Device? device=None, bool requires_grad=False) -> (Tensor)", {1, 1}}}); + return result; + } +}; + +class LinspaceConverter : public GpuConverter { +public: + LinspaceConverter() {} + virtual ~LinspaceConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector node_kind() { + return {torch::jit::aten::linspace}; + } + + // aten::linspace schema changed in torch-1.11 + const std::vector schema_string() { + if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 11) { + return {"aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)",}; + } else { + return {"aten::linspace(Scalar start, Scalar end, int steps, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)",}; + } + } + + // aten::linspace schema changed in torch-1.11 + bool assign_schema_attr() { + if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 11) { + return assign_schema_attr_helper({{"aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)", {1, 1}}}); + } else { + return assign_schema_attr_helper({{"aten::linspace(Scalar start, Scalar end, int steps, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)", {1, 1}}}); + } + } +}; + +class FulllikeConverter : public GpuConverter { +public: + FulllikeConverter() {} + virtual ~FulllikeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)",}; + } + + const std::vector node_kind() { + return {torch::jit::aten::full_like}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/converter/gpu/gpu_converter.cpp b/poros/src/poros/converter/gpu/gpu_converter.cpp new file mode 100644 index 0000000000..4d23e77b0a --- /dev/null +++ b/poros/src/poros/converter/gpu/gpu_converter.cpp @@ -0,0 +1,100 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
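The gpu_converter.cpp implementation that follows centralizes the "tensor scalar" bookkeeping shared by all GPU converters: scalar (int / int[] / float) inputs that turn out to be runtime nvinfer1::ITensor values are collected into a per-node map. As a hedged illustration of the intended call pattern (the converter name and input layout here are hypothetical, not part of this patch), a subclass would typically call check_inputs_tensor_scalar first and fall back to the constant path when it returns false:

// Editor's sketch (not part of this patch): a hypothetical converter showing
// the intended use of check_inputs_tensor_scalar()/get_tensor_scalar() defined below.
bool MyScalarOpConverter::converter(TensorrtEngine* engine, const torch::jit::Node* node) {
    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
    if (check_inputs_tensor_scalar(engine, node)) {
        // at least one scalar input is only known at runtime: read it as an ITensor
        nvinfer1::ITensor* scalar_tensor = get_tensor_scalar(inputs[1]);
        if (scalar_tensor == nullptr) {
            return false;  // mapping failed upstream, let poros fall back to the torch path
        }
        // ... build shape-dependent layers from scalar_tensor ...
    } else {
        // every scalar input is a compile-time constant
        int64_t scalar = engine->context().get_constant(inputs[1]).toInt();
        // ... build static layers from scalar ...
        (void)scalar;
    }
    return true;
}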
+ +/** +* @file gpu_converter.cpp +* @author tianshaoqing@baidu.com +* @date Mon Dec 27 11:24:21 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/gpu_converter.h" + +#include "poros/converter/gpu/weight.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +bool GpuConverter::check_inputs_tensor_scalar(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + bool has_tensor_scalar = false; + // check whether any input of type int or int[] actually holds an nvtensor + for (size_t i = 0; i < inputs.size(); i++) { + const torch::jit::Value* node_input = inputs[i]; + // int, int[] or float32 + if (node_input->type()->kind() == c10::TypeKind::IntType || + node_input->type()->isSubtypeOf(c10::ListType::ofInts()) || + node_input->type()->str() == "float") { + if (engine->context().get_tensor(node_input) != nullptr) { + // stop the loop as soon as one is found + LOG(INFO) << node_info(node) << ": inputs[" << i << "] is tensor scalar."; + has_tensor_scalar = true; + break; + } + } + } + // if any int or int[] input holds an nvtensor, map every scalar input to an nvtensor + if (has_tensor_scalar) { + _tensor_scalar_map.clear(); + for (size_t i = 0; i < inputs.size(); i++) { + const torch::jit::Value* node_input = inputs[i]; + // int or int[] + // 2022.5.13 @wangrui39: float was added here; this case is used by aten::div(Scalar a, Scalar b) -> (float) + if (node_input->type()->kind() == c10::TypeKind::IntType || + node_input->type()->isSubtypeOf(c10::ListType::ofInts()) || + node_input->type()->str() == "float") { + nvinfer1::ITensor* temp = engine->context().get_tensor(inputs[i]); + // if an nvtensor is obtained directly, create the mapping right away + if (temp != nullptr) { + _tensor_scalar_map.emplace(inputs[i], temp); + } else { + // otherwise get its ivalue, convert it to an nvtensor, and create the mapping + torch::jit::IValue temp_ivalue = engine->context().get_constant(inputs[i]); + if (temp_ivalue.isInt()) { + int64_t temp_int = temp_ivalue.toScalar().to<int64_t>(); + _tensor_scalar_map.emplace(inputs[i], + tensor_to_const(engine, torch::tensor({temp_int}, torch::kInt))); + } else if (temp_ivalue.type()->str() == "float") { + float temp_float = temp_ivalue.toScalar().to<float>(); + _tensor_scalar_map.emplace(inputs[i], + tensor_to_const(engine, torch::tensor({temp_float}, torch::kFloat))); + } else if (temp_ivalue.isIntList()) { + _tensor_scalar_map.emplace(inputs[i], + tensor_to_const(engine, torch::tensor(temp_ivalue.toIntList().vec(), torch::kInt))); + } else { + // if getting the ivalue fails as well, map the input to nullptr; callers must check for it + _tensor_scalar_map.emplace(inputs[i], nullptr); + LOG(FATAL) << node_info(node) + std::string(" input[") + + std::to_string(i) + std::string("] failed to get int ivalue."); + } + } + } + } + } + return has_tensor_scalar; +} + +nvinfer1::ITensor* GpuConverter::get_tensor_scalar(const torch::jit::Value* value) { + auto it = _tensor_scalar_map.find(value); + if (it == _tensor_scalar_map.end()) { + return nullptr; + } + return it->second; +} + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/gpu_converter.h b/poros/src/poros/converter/gpu/gpu_converter.h new file mode 100644 index 0000000000..32a3c96700 --- /dev/null +++ b/poros/src/poros/converter/gpu/gpu_converter.h @@ -0,0 +1,74 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file gpu_converter.h +* @author tianjinjin@baidu.com +* @author huangben@baidu.com +* @date Tue Jul 27 11:24:21 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/iconverter.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/log/poros_logging.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class GpuConverter : public IConverter { +public: + virtual ~GpuConverter() {} + virtual bool converter(TensorrtEngine* engine, const torch::jit::Node *node) = 0; + virtual bool converter(IEngine* engine, const torch::jit::Node *node) { + return converter(static_cast(engine), node); + } + virtual const std::vector schema_string() = 0; + virtual const std::vector node_kind() = 0; + +protected: + /** + * @brief Check whether the scalar inputs of the node are nvinfer1::ITensor (not come from prim::Constant). + * If yes, convert other scalar inputs to nvinfer1::ITensor and save them in _tensor_scalar_map. + * The type of nvinfer1::ITensor is consistent with the original scalar. + * + * @param [in] engine : TensorrtEngine + * @param [in] node : node in torch::jit::Graph + * @return bool + * @retval true => yes false => no + **/ + bool check_inputs_tensor_scalar(TensorrtEngine* engine, const torch::jit::Node *node); + /** + * @brief get nvinfer1::ITensor* type scalar from _tensor_scalar_map. + * + * @param [in] value : the input value of the node. + * @return nvinfer1::ITensor* + **/ + nvinfer1::ITensor* get_tensor_scalar(const torch::jit::Value* value); + +private: + std::unordered_map _tensor_scalar_map; +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/group_norm.cpp b/poros/src/poros/converter/gpu/group_norm.cpp new file mode 100644 index 0000000000..7282c082f1 --- /dev/null +++ b/poros/src/poros/converter/gpu/group_norm.cpp @@ -0,0 +1,461 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
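For reference, aten::group_norm computes y = (x - E[x]) / sqrt(Var[x] + eps) * gamma + beta independently over each of num_groups channel groups; the converter below realizes the normalization through TensorRT's InstanceNormalization_TRT plugin and applies gamma/beta with elementwise layers. A minimal standalone libtorch sketch (an editor illustration, not part of this patch) that checks this decomposition against at::group_norm:

#include <torch/torch.h>
#include <iostream>

int main() {
    // input [N=2, C=10, H=3, W=3], split into 2 groups of 5 channels
    auto x = torch::randn({2, 10, 3, 3});
    int64_t num_groups = 2;
    double eps = 1e-5;

    // reference result
    auto ref = at::group_norm(x, num_groups, /*weight=*/{}, /*bias=*/{}, eps);

    // manual decomposition: reshape to [N, G, C/G*H*W] and normalize per group
    auto g = x.reshape({2, num_groups, -1});
    auto mean = g.mean(-1, /*keepdim=*/true);
    auto var = (g - mean).pow(2).mean(-1, /*keepdim=*/true);
    auto norm = ((g - mean) / (var + eps).sqrt()).reshape(x.sizes());

    // maximum deviation should be on the order of float rounding error
    std::cout << (norm - ref).abs().max().item<float>() << std::endl;
    return 0;
}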
+ +/** +* @file group_norm.cpp +* @author tianshaoqing@baidu.com +* @date Fri Jan 21 15:28:37 CST 2022 +* @brief +**/ + +#include "poros/converter/gpu/group_norm.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/converter/gpu/weight.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/** + * @brief expand_gamma_beta + * such as: + * input shape is [2, 10, 3, 3]. + * This function first shuffle gamma(or beta) shape from [10] to [10, 1, 1], + * then slice gamma(or beta) shape from [10, 1, 1] to [10, 3, 3]. + * + * @param [in] engine : engine of group_norm converter. + * @param [in] weight_tensor : gamma or beta tensor. + * @param [in] weight_shuffle_dims : gamma or beta shuffle dims. + * @param [in] target_size : if input is dynamic, this parameter determines the slice size. + * @param [in] target_dims : if input is not dynamic, this parameter determines the slice size. + * @param [in] is_dynamic : input is dynamic or not. + * @return nvinfer1::ITensor* + * @retval +**/ +static nvinfer1::ITensor* expand_gamma_beta(TensorrtEngine* engine, + nvinfer1::ITensor* weight_tensor, + const nvinfer1::Dims& weight_shuffle_dims, + nvinfer1::ITensor* target_size, + const nvinfer1::Dims& target_dims, + const bool& is_dynamic, + const std::string& name) { + nvinfer1::IShuffleLayer* shuffle_l = engine->network()->addShuffle(*weight_tensor); + shuffle_l->setReshapeDimensions(weight_shuffle_dims); + std::vector start(target_dims.nbDims, 0), stride(target_dims.nbDims, 0); + stride[0] = 1; + nvinfer1::ISliceLayer* slice_l = engine->network()->addSlice(*(shuffle_l->getOutput(0)), + sizes_to_nvdim(start), + target_dims, + sizes_to_nvdim(stride)); + if (is_dynamic) { + slice_l->setInput(2, *target_size); + } + slice_l->setName(name.c_str()); + return slice_l->getOutput(0); +} + +// aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? 
bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor +bool GroupNormConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 6), "invalid inputs size for GroupNormConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for GroupNormConverter is not Tensor as expected"); + // weight & bias + POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant), + "input[2] for GroupNormConverter does not come from prim::Constant as expected"); + POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant), + "input[3] for GroupNormConverter does not come from prim::Constant as expected"); + + nvinfer1::ITensor* input = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((input != nullptr), "Unable to init input tensor for node: " << *node); + + //extract rank of input and check it + int input_rank = input->getDimensions().nbDims; + if (input_rank < 2) { + LOG(WARNING) << *node << ": num of input dimensions must be at least 2, but got " << input_rank; + return false; + } + + //extract tensor type info of input + at::ScalarType tensor_type = nvtype_to_attype(input->getType()); + auto options = torch::TensorOptions().dtype(tensor_type); + + //extract shape of input + nvinfer1::Dims ori_shape = input->getDimensions(); + nvinfer1::ITensor* ori_shape_tensor = engine->network()->addShape(*input)->getOutput(0); + std::vector<int64_t> ori_shape_vec = nvdim_to_sizes(ori_shape); + int64_t channel_size = ori_shape_vec[1]; + + //extract num_groups info + int64_t num_groups = (engine->context().get_constant(inputs[1])).toInt(); + + // check input is dynamic or not + bool is_dynamic = check_nvtensor_is_dynamic(input); + + if (!is_dynamic && channel_size % num_groups != 0) { + LOG(WARNING) << *node << ":Expected number of channels in input to be divisible by num_groups," + << " but got input of shape " << ori_shape << ", and num_groups=" << num_groups; + return false; + } + + // ATTENTION! eps must be static_cast from double to float, otherwise the instancenorm plugin coredumps + //double eps = (engine->context().get_constant(inputs[4])).toDouble(); + float eps = static_cast<float>(engine->context().get_constant(inputs[4]).toDouble()); + + //reshape input + std::vector<int64_t> new_shape = {0, num_groups, -1}; + nvinfer1::ITensor* new_shape_tensor = tensor_to_const(engine, torch::tensor(new_shape, torch::kInt64)); + nvinfer1::IShuffleLayer* input_shuffle = engine->network()->addShuffle(*input); + input_shuffle->setInput(1, *new_shape_tensor); + input_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_input").c_str()); + nvinfer1::ITensor* input_reshaped = input_shuffle->getOutput(0); + + // const std::vector<int64_t> expand_axes{3}; + // input_reshaped = unsqueeze_itensor(engine, input_reshaped, expand_axes); + nvinfer1::ITensor* norm_input = add_padding(engine, node, input_reshaped, 4); + + torch::Tensor weight_ = at::ones(num_groups, options).cpu().contiguous(); + torch::Tensor bias_ = at::zeros(num_groups, options).cpu().contiguous(); + + //set to instancenorm first + const int relu = 0; + const float alpha = 0; + std::vector<nvinfer1::PluginField> f; + f.emplace_back(nvinfer1::PluginField("epsilon", &eps, nvinfer1::PluginFieldType::kFLOAT32, 1)); + f.emplace_back(nvinfer1::PluginField("scales", weight_.data_ptr(), nvinfer1::PluginFieldType::kFLOAT32, weight_.numel())); + f.emplace_back(nvinfer1::PluginField("bias", bias_.data_ptr(), nvinfer1::PluginFieldType::kFLOAT32, bias_.numel())); + f.emplace_back(nvinfer1::PluginField("relu", &relu, nvinfer1::PluginFieldType::kINT32, 1)); + f.emplace_back(nvinfer1::PluginField("alpha", &alpha, nvinfer1::PluginFieldType::kFLOAT32, 1)); + + nvinfer1::PluginFieldCollection fc; + fc.nbFields = f.size(); + fc.fields = f.data(); + + auto creator = getPluginRegistry()->getPluginCreator("InstanceNormalization_TRT", "1", ""); + auto instance_norm_plugin = creator->createPlugin("instance_norm", &fc); + + POROS_CHECK(instance_norm_plugin, "Unable to create instance_norm plugin from TensorRT plugin registry" << *node); + auto new_layer = engine->network()->addPluginV2( + reinterpret_cast<nvinfer1::ITensor* const*>(&norm_input), 1, *instance_norm_plugin); + new_layer->setName((layer_info(node) + "_plugin_instance_norm").c_str()); + nvinfer1::ITensor* norm_reshaped = new_layer->getOutput(0); + + nvinfer1::IShuffleLayer* norm_shuffle = engine->network()->addShuffle(*norm_reshaped); + norm_shuffle->setInput(1, *ori_shape_tensor); + norm_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_input_back").c_str()); + nvinfer1::ITensor* norm = norm_shuffle->getOutput(0); + + std::vector<int64_t> axes(input_rank - 2); + std::iota(axes.begin(), axes.end(), 1); + + nvinfer1::ITensor* weight = engine->context().get_tensor(inputs[2]); + if (weight == nullptr) { + weight = tensor_to_const(engine, at::ones(1, options)); + } + weight = unsqueeze_itensor(engine, weight, axes); + + nvinfer1::ITensor* bias = engine->context().get_tensor(inputs[3]); + if (bias == nullptr) { + bias = tensor_to_const(engine, at::zeros(1, options)); + } + bias = unsqueeze_itensor(engine, bias, axes); + + //add(mul(norm, weight), bias) + nvinfer1::ITensor* mul_tensor = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + norm, + weight, + layer_info(node) + "_prod")->getOutput(0); + + nvinfer1::ITensor* final_tensor = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUM, + mul_tensor, + bias, + layer_info(node) + "_sum")->getOutput(0); + + engine->context().set_tensor(node->outputs()[0], final_tensor); + LOG(INFO) << "Output tensor shape: " <<
final_tensor->getDimensions(); + return true; +} + +// aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor +bool GroupNormConverter::converter_old(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 6), "invalid inputs size for GroupNormConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for GroupNormConverter is not Tensor as expected"); + // weight & bias + POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant), + "input[2] for GroupNormConverter does not come from prim::Constant as expected"); + POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant), + "input[3] for GroupNormConverter does not come from prim::Constant as expected"); + + nvinfer1::ITensor* input = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((input != nullptr), "Unable to init input tensor for node: " << *node); + nvinfer1::Dims ori_shape = input->getDimensions(); + + if (ori_shape.nbDims < 2) { + LOG(WARNING) << *node << ": num of input dimensions must be at least 2, but got " << ori_shape.nbDims; + return false; + } + + std::vector<int64_t> ori_shape_vec = nvdim_to_sizes(ori_shape); + int64_t num_groups = (engine->context().get_constant(inputs[1])).toInt(); + + // check input is dynamic or not + bool is_dynamic = check_nvtensor_is_dynamic(input); + + if (!is_dynamic && ori_shape_vec[1] % num_groups != 0) { + LOG(WARNING) << *node << ":Expected number of channels in input to be divisible by num_groups," + << " but got input of shape " << ori_shape << ", and num_groups=" << num_groups; + return false; + } + + // Unwrap eps. + double eps = (engine->context().get_constant(inputs[4])).toDouble(); + std::vector<nvinfer1::ITensor*> input_groups; + std::vector<nvinfer1::ITensor*> output_groups; + + // divide input into num_group parts on channels + // such as: + // input shape is [2, 10, 3, 3] and num_group = 2. + // input is divided into 2 groups on channels, and each shape is [2, 5, 3, 3].
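+ // the start/size/stride vectors below parameterize one ISliceLayer per group; the two mask vectors single out the channel dim so per-group start/size can be derived even when the input shape is dynamic.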
+ std::vector<int64_t> start_vec(ori_shape_vec.size(), 0), size_vec(ori_shape_vec), stride_vec(ori_shape_vec.size(), 1); + std::vector<int64_t> group_channel_rev_mask_vec(ori_shape_vec.size(), 1); + std::vector<int64_t> group_channel_mask_vec(ori_shape_vec.size(), 0); + group_channel_rev_mask_vec[1] = 0; + group_channel_mask_vec[1] = 1; + nvinfer1::Dims start_dims, size_dims, stride_dims; + size_vec[1] = ori_shape_vec[1] / num_groups; + if (is_dynamic) { + for (size_t i = 0; i < size_vec.size(); i++) { + size_vec[i] = 0; + } + } + start_dims = sizes_to_nvdim(start_vec); + size_dims = sizes_to_nvdim(size_vec); + stride_dims = sizes_to_nvdim(stride_vec); + nvinfer1::ITensor* ori_shape_tensor = nullptr; + nvinfer1::ITensor* size_tensor = nullptr; + nvinfer1::ITensor* start_tensor = nullptr; + + if (is_dynamic) { + ori_shape_tensor = engine->network()->addShape(*input)->getOutput(0); + at::Tensor group_channel_rev_mask_tensor = torch::tensor(group_channel_rev_mask_vec, torch::kInt); + group_channel_rev_mask_tensor[1] = num_groups; + size_tensor = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kDIV, + ori_shape_tensor, + tensor_to_const(engine, group_channel_rev_mask_tensor), + (layer_info(node) + "_div_for_shape").c_str())->getOutput(0); + at::Tensor group_channel_mask_tensor = torch::tensor(group_channel_mask_vec, torch::kInt); + + start_tensor = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + size_tensor, + tensor_to_const(engine, group_channel_mask_tensor), + (layer_info(node) + "_prod_for_shape").c_str())->getOutput(0); + } + + for (int i = 0; i < num_groups; i++) { + start_dims.d[1] = size_vec[1] * i; + nvinfer1::ISliceLayer* slice_l = engine->network()->addSlice(*input, start_dims, size_dims, stride_dims); + if (is_dynamic) { + nvinfer1::ITensor* start_it_tensor = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + start_tensor, + tensor_to_const(engine, torch::tensor(i, torch::kInt)), + (layer_info(node) + "_prod_for_start_" + std::to_string(i)).c_str())->getOutput(0); + slice_l->setInput(1, *start_it_tensor); + slice_l->setInput(2, *size_tensor); + slice_l->setName((layer_info(node) + "_ISliceLayer_" + std::to_string(i)).c_str()); + } + input_groups.push_back(slice_l->getOutput(0)); + } + // calculate (x - E[x]) / sqrt((var + eps)) for each group + for (size_t i = 0; i < input_groups.size(); i++) { + // Set up axis_mask for E[x].
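+ // e.g. for a 4-D input the loop below sets bits 1..3 (mask 0b1110): E[x] reduces every dim of the [N, C/g, H, W] slice except the batch dim.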
+ uint32_t axis_mask = 0; + for (size_t i = 0; i < ori_shape_vec.size() - 1; i++) { + axis_mask |= 1 << (ori_shape_vec.size() - i - 1); + } + LOG(INFO) << "Axis Mask for E[x]" << std::bitset<32>(axis_mask); + + // E[x] + nvinfer1::IReduceLayer* mean_expected = engine->network()->addReduce(*input_groups[i], + nvinfer1::ReduceOperation::kAVG, axis_mask, true); + POROS_CHECK(mean_expected, "Unable to create mean_expected from node: " << *node); + mean_expected->setName((layer_info(node) + "_IReduceLayer(mean_expected)_" + std::to_string(i)).c_str()); + nvinfer1::ITensor* mean_expected_out = mean_expected->getOutput(0); + + // X-E[x] + nvinfer1::ILayer* sub = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUB, + input_groups[i], + mean_expected_out, + (layer_info(node) + "_sub_" + std::to_string(i)).c_str()); + POROS_CHECK(sub, "Unable to create Sub layer from node: " << *node); + nvinfer1::ITensor* xsubmean_out = sub->getOutput(0); + + // Variance = mean(pow(xsubmean,2)) + float pow_scalar = 2.0; + nvinfer1::ITensor* exponent = tensor_to_const(engine, torch::tensor({pow_scalar}, torch::kFloat)); + nvinfer1::ILayer* pow = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPOW, + xsubmean_out, + exponent, + (layer_info(node) + "_pow_" + std::to_string(i)).c_str()); + POROS_CHECK(pow, "Unable to create Pow layer from node: " << *node); + nvinfer1::ITensor* pow_out = pow->getOutput(0); + + nvinfer1::IReduceLayer* mean_var = engine->network()->addReduce(*pow_out, + nvinfer1::ReduceOperation::kAVG, axis_mask, true); + POROS_CHECK(mean_var, "Unable to create mean_var from node: " << *node); + mean_var->setName((layer_info(node) + "_IReduceLayer(mean_var)_" + std::to_string(i)).c_str()); + nvinfer1::ITensor* mean_var_out = mean_var->getOutput(0); + + // Variance + eps + nvinfer1::ITensor* eps_tensor = tensor_to_const(engine, torch::tensor({eps}, torch::kFloat)); + nvinfer1::ILayer* add = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUM, + mean_var_out, + eps_tensor, + (layer_info(node) + "_add_" + std::to_string(i)).c_str()); + POROS_CHECK(add, "Unable to create Add layer from node: " << *node); + nvinfer1::ITensor* add_out = add->getOutput(0); + + // SQRT((Var + eps)) + nvinfer1::IUnaryLayer* sqrt = engine->network()->addUnary(*add_out, nvinfer1::UnaryOperation::kSQRT); + POROS_CHECK(sqrt, "Unable to create unary(sqrt) from node: " << *node); + sqrt->setName((layer_info(node) + "_IUnaryLayer_" + std::to_string(i)).c_str()); + nvinfer1::ITensor* sqrt_out = sqrt->getOutput(0); + + // (x - E[x]) / sqrt((var + eps)) + nvinfer1::ILayer* div = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kDIV, + xsubmean_out, + sqrt_out, + (layer_info(node) + "_div_" + std::to_string(i)).c_str()); + POROS_CHECK(div, "Unable to create div layer from node: " << *node); + nvinfer1::ITensor* div_out = div->getOutput(0); + output_groups.push_back(div_out); + } + nvinfer1::IConcatenationLayer* cat_layer = engine->network()->addConcatenation(output_groups.data(), + output_groups.size()); + cat_layer->setAxis(1); + cat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str()); + nvinfer1::ITensor* cat_out = cat_layer->getOutput(0); + engine->context().set_tensor(node->outputs()[0], cat_out); + + torch::jit::IValue maybe_weight = engine->context().get_constant(inputs[2]); + torch::jit::IValue maybe_bias = engine->context().get_constant(inputs[3]); + //when weight and bias setting is both None + if (!maybe_weight.isTensor() && !maybe_bias.isTensor()) { + 
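// neither gamma nor beta was provided: the concatenated per-group normalization is already the final output +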
engine->context().set_tensor(node->outputs()[0], cat_out); + LOG(INFO) << "Output tensor shape: " << cat_out->getDimensions(); + return true; + } + + /*------------------------------------------------------------ + * situation when weight or bias setting is not None + * ------------------------------------------------------------*/ + // Remove batch dimension from input shape for expand_size, which will + // be used to create weights for addScaleNd later. + + /** TODO: is the first input dim always the batch dim? + * If not, this converter is not correct. + * */ + + nvinfer1::ILayer* scale_l = nullptr; + nvinfer1::ILayer* shift_l = nullptr; + std::vector<int64_t> weights_dims_vec; + std::vector<int64_t> weights_shuffle_dims_vec(ori_shape_vec.size() - 1, 1); + weights_shuffle_dims_vec[0] = ori_shape_vec[1]; + weights_dims_vec.insert(weights_dims_vec.end(), ori_shape_vec.begin() + 1, ori_shape_vec.end()); + nvinfer1::ITensor* weights_shape_tensor = nullptr; + // although the shape of the input is dynamic, its rank is fixed, so we can remove its batch dim to get the expand size. + if (is_dynamic) { + for (size_t i = 0; i < weights_dims_vec.size(); i++) { + weights_dims_vec[i] = 0; + } + std::vector<int64_t> start = {1}, size = {ori_shape.nbDims - 1}, stride = {1}; + weights_shape_tensor = engine->network()->addSlice(*ori_shape_tensor, + sizes_to_nvdim(start), + sizes_to_nvdim(size), + sizes_to_nvdim(stride))->getOutput(0); + } + // if gamma exists + if (maybe_weight.isTensor()) { + torch::Tensor gamma = maybe_weight.toTensor(); + nvinfer1::ITensor* gamma_tensor = tensor_to_const(engine, gamma); + nvinfer1::ITensor* gamma_tensor_expand = expand_gamma_beta(engine, + gamma_tensor, + sizes_to_nvdim(weights_shuffle_dims_vec), + weights_shape_tensor, + sizes_to_nvdim(weights_dims_vec), + is_dynamic, + layer_info(node) + "_ISliceLayer_for_gamma"); + scale_l = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + cat_out, + gamma_tensor_expand, + (layer_info(node) + "_prod_for_scale").c_str()); + } + // if beta exists + if (maybe_bias.isTensor()) { + torch::Tensor ori_beta = maybe_bias.toTensor(); + nvinfer1::ITensor* beta_tensor = tensor_to_const(engine, ori_beta); + nvinfer1::ITensor* beta_tensor_expand = expand_gamma_beta(engine, + beta_tensor, + sizes_to_nvdim(weights_shuffle_dims_vec), + weights_shape_tensor, + sizes_to_nvdim(weights_dims_vec), + is_dynamic, + layer_info(node) + "_ISliceLayer_for_beta"); + if (scale_l == nullptr) { + shift_l = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUM, + cat_out, + beta_tensor_expand, + (layer_info(node) + "_sum_for_shift").c_str()); + + } else { + shift_l = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUM, + scale_l->getOutput(0), + beta_tensor_expand, + (layer_info(node) + "_sum_for_shift").c_str()); + } + nvinfer1::ITensor* shift_l_out = shift_l->getOutput(0); + engine->context().set_tensor(node->outputs()[0], shift_l_out); + LOG(INFO) << "Output tensor shape: " << shift_l_out->getDimensions(); + } else { + nvinfer1::ITensor* scale_l_out = scale_l->getOutput(0); + engine->context().set_tensor(node->outputs()[0], scale_l_out); + LOG(INFO) << "Output tensor shape: " << scale_l_out->getDimensions(); + + } + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, GroupNormConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/group_norm.h b/poros/src/poros/converter/gpu/group_norm.h new file mode 100644 index 0000000000..c7057cbc79 --- /dev/null +++
b/poros/src/poros/converter/gpu/group_norm.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file group_norm.h +* @author tianshaoqing@baidu.com +* @date Fri Jan 21 15:28:37 CST 2022 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class GroupNormConverter : public GpuConverter { +public: + GroupNormConverter() {} + virtual ~GroupNormConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + bool converter_old(TensorrtEngine* engine, const torch::jit::Node *node); + + virtual const std::vector schema_string() { + return {"aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor"}; + } + + virtual const std::vector node_kind() { + return {torch::jit::aten::group_norm}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/interpolate.cpp b/poros/src/poros/converter/gpu/interpolate.cpp new file mode 100644 index 0000000000..2a98b09068 --- /dev/null +++ b/poros/src/poros/converter/gpu/interpolate.cpp @@ -0,0 +1,599 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Part of the following code in this file refs to +// https://github.com/pytorch/TensorRT/blob/master/core/conversion/converters/impl/interpolate.cpp +// +// Copyright (c) 2020-present, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// Licensed under the 3-Clause BSD License + +/** +* @file interpolate.cpp +* @author tianjinjin@baidu.com +* @date Mon Mar 8 11:36:11 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/interpolate.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/* + * Helper functions + */ +void create_plugin(TensorrtEngine* engine, + const torch::jit::Node* node, + nvinfer1::ITensor* in, + const char* name, + std::vector in_shape, + std::vector out_shape, + std::vector out_size, + std::vector scales, + std::string mode, + bool align_corners, + bool use_scales = false) { + LOG(WARNING) << "Interpolation layer will be run through ATen, not TensorRT. " + << "Performance may be lower than expected"; + nvinfer1::PluginFieldCollection fc; + std::vector f; + std::vector in_shape_casted(in_shape.begin(), in_shape.end()); + f.emplace_back(nvinfer1::PluginField( + "in_shape", in_shape_casted.data(), nvinfer1::PluginFieldType::kINT32, in_shape.size())); + + std::vector out_shape_casted(out_shape.begin(), out_shape.end()); + f.emplace_back(nvinfer1::PluginField( + "out_shape", out_shape_casted.data(), nvinfer1::PluginFieldType::kINT32, out_shape.size())); + + std::vector out_size_casted(out_size.begin(), out_size.end()); + f.emplace_back(nvinfer1::PluginField( + "out_size", out_size_casted.data(), nvinfer1::PluginFieldType::kINT32, out_size.size())); + + f.emplace_back(nvinfer1::PluginField( + "scales", scales.data(), nvinfer1::PluginFieldType::kFLOAT64, scales.size())); + f.emplace_back(nvinfer1::PluginField( + "mode", &mode, nvinfer1::PluginFieldType::kCHAR, 1)); + + int32_t align_corners_casted = static_cast(align_corners); + f.emplace_back(nvinfer1::PluginField( + "align_corners", &align_corners_casted, nvinfer1::PluginFieldType::kINT32, 1)); + + int32_t use_scales_casted = static_cast(use_scales); + f.emplace_back(nvinfer1::PluginField( + "use_scales", &use_scales_casted, nvinfer1::PluginFieldType::kINT32, 1)); + + fc.nbFields = f.size(); + fc.fields = f.data(); + auto creator = getPluginRegistry()->getPluginCreator("Interpolate", "1", ""); + auto interpolate_plugin = creator->createPlugin(name, &fc); + + auto resize_layer = engine->network()->addPluginV2( + reinterpret_cast(&in), 1, *interpolate_plugin); + POROS_CHECK(resize_layer, "Unable to create interpolation plugin from node" << *node); + resize_layer->setName((layer_info(node) + "_plugin_Interpolate").c_str()); + + engine->context().set_tensor(node->outputs()[0], resize_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << resize_layer->getOutput(0)->getDimensions(); +} + +void resize_layer_size(TensorrtEngine* engine, + const torch::jit::Node* node, + nvinfer1::ITensor* in, + std::vector out_shape, + std::vector scales, + nvinfer1::ResizeMode mode, + bool align_corners = false) { + POROS_CHECK((out_shape.size() > 0) ^ (scales.size() > 0), "only one of out_shape or scales should be defined"); + auto resize_layer = engine->network()->addResize(*in); + POROS_CHECK(resize_layer, "Unable to create interpolation (resizing) layer from node" << *node); + + if (out_shape.size() > 0) { + auto th_dynamic_shape_mask = torch::zeros(out_shape.size(), torch::kInt32); + auto th_static_shape_mask = torch::zeros(out_shape.size(), torch::kInt32); + for 
(size_t idx = 0; idx < out_shape.size(); ++idx) { + if (out_shape[idx] == -1) { + th_dynamic_shape_mask[idx] = 1; + } else { + th_static_shape_mask[idx] = out_shape[idx]; + } + } + + auto dynamic_shape_mask = tensor_to_const(engine, th_dynamic_shape_mask); + auto static_shape_mask = tensor_to_const(engine, th_static_shape_mask); + auto input_shape = engine->network()->addShape(*in)->getOutput(0); + auto dynamic_shape = engine->network()->addElementWise( + *input_shape, *dynamic_shape_mask, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); + auto target_output_shape = engine->network()->addElementWise( + *dynamic_shape, *static_shape_mask, nvinfer1::ElementWiseOperation::kSUM)->getOutput(0); + resize_layer->setInput(1, *target_output_shape); + } else { + resize_layer->setScales(scales.data(), scales.size()); + if (align_corners) { + LOG(WARNING) << "interpolate with align_corners and scale_factor works differently in TensorRT and PyTorch."; + } + } + + resize_layer->setResizeMode(mode); + resize_layer->setName((layer_info(node) + "_IResizeLayer").c_str()); +#if NV_TENSORRT_MAJOR < 8 + resize_layer->setAlignCorners(align_corners); +#else + if (align_corners) { + resize_layer->setCoordinateTransformation(nvinfer1::ResizeCoordinateTransformation::kALIGN_CORNERS); + } +#endif + engine->context().set_tensor(node->outputs()[0], resize_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << resize_layer->getOutput(0)->getDimensions(); +} + +/* +"aten::upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor", +"aten::upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor", +*/ +bool UnsampleNearest1DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleNearest1DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + auto maybe_scales = engine->context().get_constant(inputs[2]); + if (maybe_outsize.isNone() && maybe_scales.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scale_factors should be defined"); + } + + if (!maybe_scales.isNone()) { + float scale = 0.0f; + // Case 1: user uses scales + if (maybe_scales.isDouble()) { + scale = maybe_scales.toDouble(); + } else { // maybe_scales.isDoubleList() + auto scale_factors = maybe_scales.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 1, "Number of scale factors should match the input size"); + scale = scale_factors[0]; + } + std::vector padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 1] = scale; + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kNEAREST); + } else { + // Case 2: user uses output size + auto output_size = maybe_outsize.toIntList(); + auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size)); + POROS_ASSERT(out_size.size() == 1, "aten::upsample_nearest1d input Tensor and output size dimension mismatch"); + auto out_shape = in_shape; + std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size())); + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kNEAREST); 
+ } + return true; +} + +/* +"aten::upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor", +"aten::upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor", +*/ +bool UnsampleNearest2DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleNearest2DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + float scale_h = 0.0f; + float scale_w = 0.0f; + + if (inputs.size() == 4) { + auto maybe_scales_h = engine->context().get_constant(inputs[2]); + auto maybe_scales_w = engine->context().get_constant(inputs[3]); + if (maybe_outsize.isNone() && (maybe_scales_h.isNone() || maybe_scales_w.isNone())) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scales should be defined"); + } + if (!maybe_scales_h.isNone() && !maybe_scales_w.isNone()) { + // Case 1: user uses scales + scale_h = maybe_scales_h.toDouble(); + scale_w = maybe_scales_w.toDouble(); + } + } else { //(inputs_size() == 3) + auto maybe_scale_factors = engine->context().get_constant(inputs[2]); + if (maybe_outsize.isNone() && maybe_scale_factors.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scale_factors should be defined"); + } + + if (!maybe_scale_factors.isNone()) { + // Case 1: user uses scales + auto scale_factors = maybe_scale_factors.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 2, "Number of scale factors should match the input size"); + scale_h = scale_factors[0]; + scale_w = scale_factors[1]; + } + } + + if (!engine->context().get_constant(inputs[2]).isNone()) { + std::vector padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 2] = scale_h; + padded_scales[padded_scales.size() - 1] = scale_w; + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kNEAREST); + } else { + // Case 2: user uses output size + auto output_size = maybe_outsize.toIntList(); + auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size)); + POROS_ASSERT(out_size.size() == 2, "aten::upsample_nearest2d input Tensor and output size dimension mismatch"); + auto out_shape = in_shape; + std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size())); + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kNEAREST); + } + return true; +} + +/* +"aten::upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", +"aten::upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? 
scale_factors) -> Tensor", +*/ +bool UnsampleNearest3DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleNearest3DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + float scale_d = 0.0f; + float scale_h = 0.0f; + float scale_w = 0.0f; + + if (inputs.size() == 5) { + auto maybe_scales_d = engine->context().get_constant(inputs[2]); + auto maybe_scales_h = engine->context().get_constant(inputs[3]); + auto maybe_scales_w = engine->context().get_constant(inputs[4]); + if (maybe_outsize.isNone() && (maybe_scales_d.isNone() || + maybe_scales_h.isNone() || maybe_scales_w.isNone())) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scales should be defined"); + } + if (!maybe_scales_d.isNone() && !maybe_scales_h.isNone() && !maybe_scales_w.isNone()) { + // Case 1: user uses scales + scale_d = maybe_scales_d.toDouble(); + scale_h = maybe_scales_h.toDouble(); + scale_w = maybe_scales_w.toDouble(); + } + } else { //(inputs_size() == 3) + auto maybe_scale_factors = engine->context().get_constant(inputs[2]); + if (maybe_outsize.isNone() && maybe_scale_factors.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scale_factors should be defined"); + } + + if (!maybe_scale_factors.isNone()) { + // Case 1: user uses scales + auto scale_factors = maybe_scale_factors.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 3, "Number of scale factors should match the input size"); + scale_d = scale_factors[0]; + scale_h = scale_factors[1]; + scale_w = scale_factors[2]; + } + } + + if (!engine->context().get_constant(inputs[2]).isNone()) { + std::vector padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 3] = scale_d; + padded_scales[padded_scales.size() - 2] = scale_h; + padded_scales[padded_scales.size() - 1] = scale_w; + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kNEAREST); + } else { + // Case 2: user uses output size + auto output_size = maybe_outsize.toIntList(); + auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size)); + POROS_ASSERT(out_size.size() == 3, "aten::upsample_nearest3d input Tensor and output size dimension mismatch"); + auto out_shape = in_shape; + std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size())); + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kNEAREST); + } + return true; +} + +/* +"aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor", +"aten::upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor", +*/ +bool UnsampleLinear1DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleLinear1DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + //extract align_corners + auto align_corners = engine->context().get_constant(inputs[2]).toBool(); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + auto maybe_scales = engine->context().get_constant(inputs[3]); + if (maybe_outsize.isNone() && maybe_scales.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scales should be defined"); + } + + if (!maybe_scales.isNone()) { + // Case 1: user uses scales + float scale = 0.0f; + if (maybe_scales.isDouble()) { + scale = maybe_scales.toDouble(); + } else { //maybe_scales.isDoubleList() + auto scale_factors = maybe_scales.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 1, "Number of scale factors should match the input size"); + scale = scale_factors[0]; + } + std::vector padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 1] = scale; +#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0 + if (!align_corners) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nupsample_linear1d only supports align_corner with TensorRT <= 7.0."); + } else { + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kLINEAR, true); + } +#else + auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic; + POROS_CHECK(!(align_corners && is_dynamic_shape), "Poros currently does not support the compilation of dynamc engines" + << "from code using using PyTorch [bi/tri]linear interpolation via scale factor and align_corners=True"); + if (align_corners) { + // Align corners and scale factor behave slightly different together in TRT and PyTorch so run the + // layer in ATen to maintain consistancy between TRTorch and PyTorch + // https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.interpolate + create_plugin(engine, node, in, "linear1d", in_shape, {}, {}, {scale}, std::string("linear"), align_corners, true); + } else { + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kLINEAR, align_corners); + } +#endif + } else { + // Case 2: user uses output size + auto output_size = maybe_outsize.toIntList(); + auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size)); + POROS_ASSERT(out_size.size() == 1, "aten::upsample_linear1d input Tensor and output size dimension mismatch"); + auto out_shape = in_shape; + std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size())); +#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0 + if (!align_corners) { + create_plugin(engine, node, in, "linear1d", in_shape, out_shape, out_size, {}, std::string("linear"), align_corners); + } else { + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, true); + } +#else + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, align_corners); +#endif + } + return true; 
+} + +/* +"aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", +"aten::upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", +*/ +bool UnsampleBilinear2DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleBilinear2DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + //extract align_corners + auto align_corners = engine->context().get_constant(inputs[2]).toBool(); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + float scale_h = 0.0f; + float scale_w = 0.0f; + + if (inputs.size() == 5) { + auto maybe_scales_h = engine->context().get_constant(inputs[3]); + auto maybe_scales_w = engine->context().get_constant(inputs[4]); + if (maybe_outsize.isNone() && (maybe_scales_h.isNone() || maybe_scales_w.isNone())) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scales should be defined"); + } + if (!maybe_scales_h.isNone() && !maybe_scales_w.isNone()) { + // Case 1: user uses scales + scale_h = maybe_scales_h.toDouble(); + scale_w = maybe_scales_w.toDouble(); + } + } else { //(inputs.size() == 4) + auto maybe_scale_factors = engine->context().get_constant(inputs[3]); + if (maybe_outsize.isNone() && maybe_scale_factors.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scale_factors should be defined"); + } + if (!maybe_scale_factors.isNone()) { + // Case 1: user uses scales + auto scale_factors = maybe_scale_factors.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 2, "Number of scale factors should match the input size"); + scale_h = scale_factors[0]; + scale_w = scale_factors[1]; + } + } + + if (!engine->context().get_constant(inputs[3]).isNone()) { + std::vector<float> padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 2] = scale_h; + padded_scales[padded_scales.size() - 1] = scale_w; +#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0 + if (!align_corners) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nupsample_bilinear2d only supports align_corner with TensorRT <= 7.0."); + } else { + resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kLINEAR, true); + } +#else + auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic; + POROS_CHECK(!(align_corners && is_dynamic_shape), "Poros currently does not support the compilation of dynamic engines " + << "from code using PyTorch [bi/tri]linear interpolation via scale factor and align_corners=True"); + if (align_corners) { + // Align corners and scale factor behave slightly differently together in TRT and PyTorch so run the + // layer in ATen to maintain consistency between TRTorch and PyTorch + // https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.interpolate + create_plugin(engine, node, in, "bilinear2d", in_shape, {}, {}, {scale_h, scale_w}, std::string("bilinear"), align_corners, true); + } else { + resize_layer_size(engine, node, in, {},
padded_scales, nvinfer1::ResizeMode::kLINEAR, align_corners); + } +#endif + } else { + // Case 2: user uses output size + auto output_size = maybe_outsize.toIntList(); + auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size)); + POROS_ASSERT(out_size.size() == 2, "aten::upsample_bilinear2d input Tensor and output size dimension mismatch"); + auto out_shape = in_shape; + std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size())); +#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0 + if (!align_corners) { + create_plugin(engine, node, in, "bilinear2d", in_shape, out_shape, out_size, {}, std::string("bilinear"), align_corners); + } else { + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, true); + } +#else + resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, align_corners); +#endif + } + return true; +} + +/* +"aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", +"aten::upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", +*/ +bool UnsampleTrilinear3DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for UnsampleTrilinear3DConverter is not Tensor as expected"); + + //extract in + auto in = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node); + auto in_shape = nvdim_to_sizes(in->getDimensions()); + //extract align_corners + auto align_corners = engine->context().get_constant(inputs[2]).toBool(); + + auto maybe_outsize = engine->context().get_constant(inputs[1]); + float scale_d = 0.0f; + float scale_h = 0.0f; + float scale_w = 0.0f; + + if (inputs.size() == 6) { + auto maybe_scales_d = engine->context().get_constant(inputs[3]); + auto maybe_scales_h = engine->context().get_constant(inputs[4]); + auto maybe_scales_w = engine->context().get_constant(inputs[5]); + if (maybe_outsize.isNone() && (maybe_scales_h.isNone() + || maybe_scales_w.isNone() || maybe_scales_d.isNone())) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scales should be defined"); + } + if (!maybe_scales_h.isNone() && !maybe_scales_w.isNone() && maybe_scales_d.isNone()) { + // Case 1: user uses scales + scale_d = maybe_scales_d.toDouble(); + scale_h = maybe_scales_h.toDouble(); + scale_w = maybe_scales_w.toDouble(); + } + } else { //(inputs_size() == 4) + auto maybe_scale_factors = engine->context().get_constant(inputs[3]); + if (maybe_outsize.isNone() && maybe_scale_factors.isNone()) { + POROS_THROW_ERROR("Unable to convert node: " << node_info(node) + << "\nOne of output_size or scale_factors should be defined"); + } + if (!maybe_scale_factors.isNone()) { + // Case 1: user uses scales + auto scale_factors = maybe_scale_factors.toDoubleList(); + POROS_ASSERT(scale_factors.size() == 3, "Number of scale factors should match the input size"); + scale_d = scale_factors[0]; + scale_h = scale_factors[1]; + scale_w = scale_factors[2]; + } + } + + if (!engine->context().get_constant(inputs[3]).isNone()) { + std::vector padded_scales(in_shape.size(), 1); + padded_scales[padded_scales.size() - 3] = scale_d; + 
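+
+// Illustrative sketch (not part of the original change; assumes a rank-4 NCHW
+// input): in the output-size path, out_shape keeps the leading batch/channel
+// dims and only overwrites the trailing spatial dims.
+//
+//   std::vector<int64_t> in_shape = {8, 3, 32, 32};
+//   std::vector<int64_t> out_size = {64, 64};   // from int[2] output_size
+//   auto out_shape = in_shape;
+//   std::copy(out_size.begin(), out_size.end(),
+//             out_shape.begin() + (in_shape.size() - out_size.size()));
+//   // out_shape == {8, 3, 64, 64}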
+
+/*
+"aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor",
+"aten::upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor",
+*/
+bool UnsampleTrilinear3DConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for UnsampleTrilinear3DConverter is not Tensor as expected");
+
+    //extract in
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+    //extract align_corners
+    auto align_corners = engine->context().get_constant(inputs[2]).toBool();
+
+    auto maybe_outsize = engine->context().get_constant(inputs[1]);
+    float scale_d = 0.0f;
+    float scale_h = 0.0f;
+    float scale_w = 0.0f;
+
+    if (inputs.size() == 6) {
+        auto maybe_scales_d = engine->context().get_constant(inputs[3]);
+        auto maybe_scales_h = engine->context().get_constant(inputs[4]);
+        auto maybe_scales_w = engine->context().get_constant(inputs[5]);
+        if (maybe_outsize.isNone() && (maybe_scales_h.isNone()
+            || maybe_scales_w.isNone() || maybe_scales_d.isNone())) {
+            POROS_THROW_ERROR("Unable to convert node: " << node_info(node)
+                << "\nOne of output_size or scales should be defined");
+        }
+        if (!maybe_scales_h.isNone() && !maybe_scales_w.isNone() && !maybe_scales_d.isNone()) {
+            // Case 1: user uses scales
+            scale_d = maybe_scales_d.toDouble();
+            scale_h = maybe_scales_h.toDouble();
+            scale_w = maybe_scales_w.toDouble();
+        }
+    } else { //(inputs.size() == 4)
+        auto maybe_scale_factors = engine->context().get_constant(inputs[3]);
+        if (maybe_outsize.isNone() && maybe_scale_factors.isNone()) {
+            POROS_THROW_ERROR("Unable to convert node: " << node_info(node)
+                << "\nOne of output_size or scale_factors should be defined");
+        }
+        if (!maybe_scale_factors.isNone()) {
+            // Case 1: user uses scales
+            auto scale_factors = maybe_scale_factors.toDoubleList();
+            POROS_ASSERT(scale_factors.size() == 3, "Number of scale factors should match the input size");
+            scale_d = scale_factors[0];
+            scale_h = scale_factors[1];
+            scale_w = scale_factors[2];
+        }
+    }
+
+    //inputs[3] holds scales_d (first schema) or scale_factors (vec schema):
+    //either way, a non-None value selects the scales path.
+    if (!engine->context().get_constant(inputs[3]).isNone()) {
+        std::vector<float> padded_scales(in_shape.size(), 1);
+        padded_scales[padded_scales.size() - 3] = scale_d;
+        padded_scales[padded_scales.size() - 2] = scale_h;
+        padded_scales[padded_scales.size() - 1] = scale_w;
+#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0
+        if (!align_corners) {
+            POROS_THROW_ERROR("Unable to convert node: " << node_info(node)
+                << "\nupsample_trilinear3d only supports align_corners with TensorRT <= 7.0.");
+        } else {
+            resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kLINEAR, true);
+        }
+#else
+        auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic;
+        POROS_CHECK(!(align_corners && is_dynamic_shape), "Poros currently does not support the compilation of dynamic engines "
+            << "from code using PyTorch [bi/tri]linear interpolation via scale factor and align_corners=True");
+        if (align_corners) {
+            // Align corners and scale factor behave slightly differently together in TRT and PyTorch, so run the
+            // layer in ATen to maintain consistency between TRTorch and PyTorch.
+            // https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.interpolate
+            create_plugin(engine, node, in, "trilinear3d", in_shape, {}, {}, {scale_d, scale_h, scale_w}, std::string("trilinear"), align_corners, true);
+        } else {
+            resize_layer_size(engine, node, in, {}, padded_scales, nvinfer1::ResizeMode::kLINEAR, align_corners);
+        }
+#endif
+    } else {
+        // Case 2: user uses output size
+        auto output_size = maybe_outsize.toIntList();
+        auto out_size = nvdim_to_sizes(sizes_to_nvdim(output_size));
+        POROS_ASSERT(out_size.size() == 3, "aten::upsample_trilinear3d input Tensor and output size dimension mismatch");
+        auto out_shape = in_shape;
+        std::copy(out_size.begin(), out_size.end(), out_shape.begin() + (in_shape.size() - out_size.size()));
+#if NV_TENSORRT_MAJOR < 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR < 1) // IF TRT VERSION <= 7.0
+        if (!align_corners) {
+            create_plugin(engine, node, in, "trilinear3d", in_shape, out_shape, out_size, {}, std::string("trilinear"), align_corners);
+        } else {
+            resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, true);
+        }
+#else
+        resize_layer_size(engine, node, in, out_shape, {}, nvinfer1::ResizeMode::kLINEAR, align_corners);
+#endif
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleNearest1DConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleNearest2DConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleNearest3DConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleLinear1DConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleBilinear2DConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnsampleTrilinear3DConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/interpolate.h b/poros/src/poros/converter/gpu/interpolate.h
new file mode 100644
index 0000000000..90ceca75fd
--- /dev/null
+++ b/poros/src/poros/converter/gpu/interpolate.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file interpolate.h
+* @author tianjinjin@baidu.com
+* @date Mon Aug 16 12:26:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class UnsampleNearest1DConverter : public GpuConverter {
+public:
+    UnsampleNearest1DConverter() {}
+    virtual ~UnsampleNearest1DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor",
+                "aten::upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_nearest1d};
+    }
+};
+
+class UnsampleNearest2DConverter : public GpuConverter {
+public:
+    UnsampleNearest2DConverter() {}
+    virtual ~UnsampleNearest2DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor",
+                "aten::upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_nearest2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_nearest2d};
+    }
+};
+
+class UnsampleNearest3DConverter : public GpuConverter {
+public:
+    UnsampleNearest3DConverter() {}
+    virtual ~UnsampleNearest3DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor",
+                "aten::upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_nearest3d.out(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_nearest3d};
+    }
+};
+
+class UnsampleLinear1DConverter : public GpuConverter {
+public:
+    UnsampleLinear1DConverter() {}
+    virtual ~UnsampleLinear1DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor",
+                "aten::upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_linear1d.out(Tensor self, int[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_linear1d};
+    }
+};
+
+class UnsampleBilinear2DConverter : public GpuConverter {
+public:
+    UnsampleBilinear2DConverter() {}
+    virtual ~UnsampleBilinear2DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor",
+                "aten::upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_bilinear2d.out(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_bilinear2d};
+    }
+};
+
+class UnsampleTrilinear3DConverter : public GpuConverter {
+public:
+    UnsampleTrilinear3DConverter() {}
+    virtual ~UnsampleTrilinear3DConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor",
+                "aten::upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::upsample_trilinear3d.out(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::upsample_trilinear3d};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/layer_norm.cpp b/poros/src/poros/converter/gpu/layer_norm.cpp
new file mode 100644
index 0000000000..16cd2296c1
--- /dev/null
+++ b/poros/src/poros/converter/gpu/layer_norm.cpp
@@ -0,0 +1,198 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file layer_norm.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Aug 20 15:28:37 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/layer_norm.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+aten::layer_norm(Tensor input,
+int[] normalized_shape,
+Tensor? weight=None,
+Tensor? bias=None,
+float eps=1e-05,
+bool cudnn_enable=True) -> Tensor
+*/
+bool LayerNormConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 6), "invalid inputs size for LayerNormConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for LayerNormConverter is not Tensor as expected");
+    // weight & bias
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for LayerNormConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant),
+        "input[3] for LayerNormConverter does not come from prim::Constant as expected");
+
+    nvinfer1::ITensor* input = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((input != nullptr), "Unable to init input tensor for node: " << *node);
+    nvinfer1::Dims orig_shape = input->getDimensions();
+    std::vector<int64_t> shape = nvdim_to_sizes(orig_shape);
+
+    /* LayerNorm normalizes over the last N dimensions.
+       normalized_shape could be (C,H,W), (H,W), or (W). */
+    c10::List<int64_t> normalized_shape = (engine->context().get_constant(inputs[1])).toIntList();
+    std::vector<int64_t> normalized_shape_vec = nvdim_to_sizes(sizes_to_nvdim(normalized_shape));
+
+    // Unwrap eps.
+    double eps = (engine->context().get_constant(inputs[4])).toDouble();
+
+    // Set up axis_mask for E[x].
+    uint32_t axis_mask = 0;
+    for (size_t i = 0; i < normalized_shape_vec.size(); i++) {
+        axis_mask |= 1 << (shape.size() - i - 1);
+    }
+    LOG(INFO) << "Axis Mask for E[x]: " << std::bitset<32>(axis_mask);
+
+    // E[x]
+    nvinfer1::IReduceLayer* mean_expected = engine->network()->addReduce(*input,
+        nvinfer1::ReduceOperation::kAVG, axis_mask, true);
+    POROS_CHECK(mean_expected, "Unable to create mean_expected from node: " << *node);
+    mean_expected->setName((layer_info(node) + "_IReduceLayer(mean_expected)").c_str());
+    nvinfer1::ITensor* mean_expected_out = mean_expected->getOutput(0);
+
+    // X-E[x]
+    nvinfer1::ILayer* sub = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kSUB,
+        input,
+        mean_expected_out,
+        (layer_info(node) + "_sub").c_str());
+    POROS_CHECK(sub, "Unable to create Sub layer from node: " << *node);
+    nvinfer1::ITensor* xsubmean_out = sub->getOutput(0);
+
+    // Variance = mean(pow(xsubmean, 2))
+    float pow_scalar = 2;
+    nvinfer1::ITensor* exponent = tensor_to_const(engine, torch::tensor({pow_scalar}));
+    nvinfer1::ILayer* pow = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kPOW,
+        xsubmean_out,
+        exponent,
+        (layer_info(node) + "_pow").c_str());
+    POROS_CHECK(pow, "Unable to create Pow layer from node: " << *node);
+    nvinfer1::ITensor* pow_out = pow->getOutput(0);
+
+    nvinfer1::IReduceLayer* mean_var = engine->network()->addReduce(*pow_out,
+        nvinfer1::ReduceOperation::kAVG, axis_mask, true);
+    POROS_CHECK(mean_var, "Unable to create mean_var from node: " << *node);
+    mean_var->setName((layer_info(node) + "_IReduceLayer(mean_var)").c_str());
+    nvinfer1::ITensor* mean_var_out = mean_var->getOutput(0);
+
+    // Variance + eps
+    nvinfer1::ITensor* eps_tensor = tensor_to_const(engine, torch::tensor({eps}));
+    nvinfer1::ILayer* add = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kSUM,
+        mean_var_out,
+        eps_tensor,
+        (layer_info(node) + "_sum").c_str());
+    POROS_CHECK(add, "Unable to create Add layer from node: " << *node);
+    nvinfer1::ITensor* add_out = add->getOutput(0);
+
+    // SQRT((Var + eps))
+    nvinfer1::IUnaryLayer* sqrt = engine->network()->addUnary(*add_out, nvinfer1::UnaryOperation::kSQRT);
+    POROS_CHECK(sqrt, "Unable to create unary(sqrt) from node: " << *node);
+    sqrt->setName((layer_info(node) + "_IUnaryLayer").c_str());
+    nvinfer1::ITensor* sqrt_out = sqrt->getOutput(0);
+
+    // (x - E[x]) / sqrt((var + eps))
+    nvinfer1::ILayer* div = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kDIV,
+        xsubmean_out,
+        sqrt_out,
+        (layer_info(node) + "_div").c_str());
+    POROS_CHECK(div, "Unable to create div layer from node: " << *node);
+    nvinfer1::ITensor* div_out = div->getOutput(0);
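+
+    // Sketch of what the layers above compute (annotation added for clarity,
+    // not part of the original change): the normalization core of LayerNorm,
+    //
+    //   y = (x - E[x]) / sqrt(Var[x] + eps)
+    //
+    // with E[x] and Var[x] reduced over the axes selected by axis_mask. For a
+    // rank-4 input and normalized_shape = (H, W), axis_mask has bits 2 and 3
+    // set, so mean and variance are taken over the last two dimensions only.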
+
+    torch::jit::IValue maybe_weight = engine->context().get_constant(inputs[2]);
+    torch::jit::IValue maybe_bias = engine->context().get_constant(inputs[3]);
+    //when weight and bias are both None
+    if (!maybe_weight.isTensor() && !maybe_bias.isTensor()) {
+        engine->context().set_tensor(node->outputs()[0], div_out);
+        LOG(INFO) << "Output tensor shape: " << div_out->getDimensions();
+        return true;
+    }
+
+    /*------------------------------------------------------------
+     * situation when weight or bias setting is not None
+     * ------------------------------------------------------------*/
+    // Remove batch dimension from input shape for expand_size, which will
+    // be used to create weights for addScaleNd later.
+
+    /** TODO: is the first input dimension always the batch dimension?
+     *  If not, this converter is not correct.
+     * */
+
+    // Set up gamma and beta via tensor_to_const directly;
+    // broadcast is done automatically by add_elementwise, so no explicit expand is needed.
+    nvinfer1::ILayer* scale_l = nullptr;
+    nvinfer1::ILayer* shift_l = nullptr;
+    if (maybe_weight.isTensor()) {
+        torch::Tensor gamma = maybe_weight.toTensor();
+        nvinfer1::ITensor* gamma_tensor = tensor_to_const(engine, gamma);
+        scale_l = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kPROD,
+            div_out,
+            gamma_tensor,
+            (layer_info(node) + "_prod_for_gamma").c_str());
+    }
+
+    if (maybe_bias.isTensor()) {
+        torch::Tensor ori_beta = maybe_bias.toTensor();
+        nvinfer1::ITensor* beta_tensor = tensor_to_const(engine, ori_beta);
+        if (scale_l == nullptr) {
+            shift_l = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUM,
+                div_out,
+                beta_tensor,
+                (layer_info(node) + "_sum_for_beta").c_str());
+        } else {
+            shift_l = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUM,
+                scale_l->getOutput(0),
+                beta_tensor,
+                (layer_info(node) + "_sum_for_beta").c_str());
+        }
+        nvinfer1::ITensor* shift_l_out = shift_l->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], shift_l_out);
+        LOG(INFO) << "Output tensor shape: " << shift_l_out->getDimensions();
+    } else {
+        nvinfer1::ITensor* scale_l_out = scale_l->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], scale_l_out);
+        LOG(INFO) << "Output tensor shape: " << scale_l_out->getDimensions();
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, LayerNormConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/layer_norm.h b/poros/src/poros/converter/gpu/layer_norm.h
new file mode 100644
index 0000000000..cd4d9504a2
--- /dev/null
+++ b/poros/src/poros/converter/gpu/layer_norm.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file layer_norm.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 20 15:28:37 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class LayerNormConverter : public GpuConverter {
+public:
+    LayerNormConverter() {}
+    virtual ~LayerNormConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    virtual const std::vector<std::string> schema_string() {
+        return {"aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"};
+    }
+
+    virtual const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::layer_norm};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor", {1, 1}}});
+        return result;
+    }
+
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/linear.cpp b/poros/src/poros/converter/gpu/linear.cpp
new file mode 100644
index 0000000000..cf9237ab84
--- /dev/null
+++ b/poros/src/poros/converter/gpu/linear.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file linear.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Aug 20 17:21:44 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/linear.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/** aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+ * the implementation of aten::linear in pytorch is in file: aten/src/Aten/native/Linear.cpp
+ * the core function is like this:
+ *   auto bias = bias_opt.has_value()
+ *       ? c10::MaybeOwned<at::Tensor>::borrowed(*bias_opt)
+ *       : c10::MaybeOwned<at::Tensor>::owned(c10::in_place);
+ *   if (input.dim() == 2 && bias->defined()) {
+ *       return at::addmm(*bias, input, weight.t());
+ *   }
+ *   auto output = at::matmul(input, weight.t());
+ *   if (bias->defined()) {
+ *       output.add_(*bias);
+ *   }
+ *   return output;
+* we can refer to the original PyTorch implementation.
+* ******************************
+* %res = aten::linear(%input, %weight_0, %bias)
+* try to convert it to matmul like below:
+*
+* %weight = aten::t(%weight_0)
+* %mm = aten::matmul(%input, %weight)
+* if %bias is None:
+*     return %mm
+* else:
+*     if (input.dim == 2):
+*         %res = aten::add(%bias, %mm, 1)
+*     else:
+*         %res = aten::add(%mm, %bias, 1)
+**/
+bool LinearConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for LinearConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for LinearConverter is not Tensor as expected");
+
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    //auto origin_self_dim = self->getDimensions().nbDims;
+
+    // handle weight
+    nvinfer1::ITensor* weight = nullptr;
+    bool need_trans = false;
+    auto maybe_weight = engine->context().get_constant(inputs[1]);
+    if (maybe_weight.isTensor()) {
+        //constant tensor
+        at::Tensor weight_t = maybe_weight.toTensor().t();
+        int weight_rank = weight_t.sizes().size();
+        //if the tensor needs padding, transpose and pad it first and only then turn it into
+        //a constant tensor, to avoid hitting TensorRT's const-shuffle tactic.
+        if (weight_rank < self->getDimensions().nbDims) {
+            at::Tensor padding_weight = weight_t;
+            for (int dim = weight_rank; dim < self->getDimensions().nbDims; ++dim) {
+                padding_weight = padding_weight.unsqueeze(0);
+            }
+            weight = tensor_to_const(engine, padding_weight);
+        } else {
+            weight = tensor_to_const(engine, weight_t);
+        }
+    } else {
+        //weight comes from another tensor
+        weight = engine->context().get_tensor(inputs[1]);
+        if (weight->getDimensions().nbDims >= 2) {
+            need_trans = true;
+        }
+        /* leave the transpose to matmul instead of implementing our own shuffle:
+        auto weight_before_trans = engine->context().get_tensor(inputs[1]);
+        auto weight_dims = weight_before_trans->getDimensions();
+        if (weight_dims.nbDims < 2) {
+            weight = weight_before_trans;
+        } else {
+            //like aten::transpose(input, 0, 1)
+            auto shuffle_layer = engine->network()->addShuffle(*weight_before_trans);
+            POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+            nvinfer1::Permutation first_perm;
+            first_perm.order[0] = 1;
+            first_perm.order[1] = 0;
+            shuffle_layer->setFirstTranspose(first_perm);
+            shuffle_layer->setZeroIsPlaceholder(false);
+            shuffle_layer->setName((layer_info(node) + "_IShuffleLayer(weight_transpose)").c_str());
+            weight = shuffle_layer->getOutput(0);
+        } */
+    }
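+
+    // Illustrative sketch (not part of the original change; shapes assumed):
+    // the padding branch above aligns the transposed constant weight's rank
+    // with the input rank so that matmul can broadcast.
+    //
+    //   at::Tensor weight_t = weight.t();          // [out, in] -> [in, out]
+    //   at::Tensor padded = weight_t.unsqueeze(0); // rank-3 input -> [1, in, out]
+    //   // matmul then broadcasts over the leading batch dimension.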
+
+    // Ensure self and weight tensors have the same nbDims by expanding
+    // the dimensions (from the 0 axis) if necessary.
+    if (self->getDimensions().nbDims < weight->getDimensions().nbDims) {
+        self = add_padding(engine, node, self, weight->getDimensions().nbDims, false, false);
+    } else {
+        weight = add_padding(engine, node, weight, self->getDimensions().nbDims, false, false);
+    }
+
+    nvinfer1::IMatrixMultiplyLayer* mm_layer = nullptr;
+    if (need_trans) {
+        mm_layer = engine->network()->addMatrixMultiply(
+            *self, nvinfer1::MatrixOperation::kNONE, *weight, nvinfer1::MatrixOperation::kTRANSPOSE);
+    } else {
+        mm_layer = engine->network()->addMatrixMultiply(
+            *self, nvinfer1::MatrixOperation::kNONE, *weight, nvinfer1::MatrixOperation::kNONE);
+    }
+    POROS_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *node);
+
+    auto bias = engine->context().get_tensor(inputs[2]);
+    /*--------------------------------------------------------------
+     * bias is None situation
+     * -------------------------------------------------------------*/
+    //bias is None: return the matmul output directly
+    if (bias == nullptr) {
+        mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer").c_str());
+        engine->context().set_tensor(node->outputs()[0], mm_layer->getOutput(0));
+        LOG(INFO) << "Output tensor shape: " << mm_layer->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    /*--------------------------------------------------------------
+     * bias is not None situation
+     * -------------------------------------------------------------*/
+    mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer").c_str());
+
+    nvinfer1::ILayer* new_layer = nullptr;
+    // if (origin_self_dim == 2) {
+    //     //TODO: ADD SOME FUNCTION HERE
+    // } else {
+    new_layer = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kSUM,
+        mm_layer->getOutput(0),
+        bias,
+        layer_info(node) + "_sum");
+    //}
+    POROS_CHECK(new_layer, "Unable to create add layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
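+
+// Summary sketch (annotation, not part of the original change): end to end, the
+// converter above lowers aten::linear to at most two TensorRT layers,
+//
+//   mm  = IMatrixMultiplyLayer(self kNONE, weight kTRANSPOSE when needed)
+//   res = IElementWiseLayer(kSUM, mm, bias)   // only when bias is present
+//
+// which mirrors matmul(input, weight.t()) + bias from the PyTorch reference.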
+
+//DEPRECATED: results do not match the pytorch output
+bool LinearConverter::converter_fully_connect_version(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for LinearConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for LinearConverter is not Tensor as expected");
+    // weight & bias
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[1] for LinearConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for LinearConverter does not come from prim::Constant as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto shape = nvdim_to_sizes(in->getDimensions());
+    LOG(INFO) << "Input tensor shape: " << in->getDimensions();
+
+    // PyTorch follows in: Nx*xIN, W: OUTxIN, B: OUT, out: Nx*xOUT
+    // TensorRT inserts a flatten in when following conv
+    POROS_ASSERT(shape.size() >= 2,
+        "aten::linear expects input tensors to be of shape [N,..., in features], but found input Tensor less than 2D");
+
+    if (shape.size() < 4) {
+        // Flatten
+        std::vector<int64_t> new_shape;
+        new_shape.push_back(shape[0]);
+        new_shape.push_back(1);
+        new_shape.push_back(1);
+        new_shape.push_back(nvdim_to_volume(sizes_to_nvdim(shape)) / shape[0]);
+        auto new_dims = sizes_to_nvdim(new_shape);
+
+        LOG(INFO) << "Input shape is less than 4D got: " << sizes_to_nvdim(shape)
+            << ", inserting shuffle layer to reshape to 4D tensor shape: " << new_dims;
+
+        auto in_shuffle = engine->network()->addShuffle(*in);
+        in_shuffle->setReshapeDimensions(new_dims);
+        in_shuffle->setName((layer_info(node) + "_IShuffleLayer").c_str());
+        in = in_shuffle->getOutput(0);
+    }
+
+    auto w_tensor = (engine->context().get_constant(inputs[1])).toTensor();
+    Weights w = Weights(w_tensor);
+
+    nvinfer1::ILayer* new_layer;
+    auto maybe_bias = engine->context().get_constant(inputs[2]);
+    if (maybe_bias.isTensor()) {
+        auto bias = maybe_bias.toTensor();
+        Weights b = Weights(bias);
+        new_layer = engine->network()->addFullyConnected(*in, w.outputs_num, w.data, b.data);
+    } else {
+        LOG(INFO) << "There is no bias for the linear layer";
+        new_layer = engine->network()->addFullyConnected(*in, w.outputs_num, w.data, Weights().data);
+    }
+    POROS_CHECK(new_layer, "Unable to create linear layer from node: " << *node);
+    new_layer->setName((layer_info(node) + "_IFullyConnectedLayer").c_str());
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, LinearConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/linear.h b/poros/src/poros/converter/gpu/linear.h
new file mode 100644
index 0000000000..939215443b
--- /dev/null
+++ b/poros/src/poros/converter/gpu/linear.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file linear.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 20 17:21:44 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class LinearConverter : public GpuConverter {
+public:
+    LinearConverter() {}
+    virtual ~LinearConverter() {}
+
+    //currently used version:
+    //follows the PyTorch implementation and converts to addmm or matmul + add as appropriate.
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //DEPRECATED: version that builds the network with addFullyConnected.
+    //shelved after hitting dimension mismatches in transformer models.
+    bool converter_fully_connect_version(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    virtual const std::vector<std::string> schema_string() {
+        return {"aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor"};
+    }
+
+    virtual const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::linear};
+    }
+
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/converter/gpu/list.cpp b/poros/src/poros/converter/gpu/list.cpp
new file mode 100644
index 0000000000..7b33a8e25a
--- /dev/null
+++ b/poros/src/poros/converter/gpu/list.cpp
@@ -0,0 +1,184 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file list.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+#include "torch/script.h"
+
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/list.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool ListConstructConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    const torch::jit::Value* output = node->outputs()[0];
+    const auto num_inputs = inputs.size();
+    //typical situation: Construct a TensorList
+    if (output->type()->isSubtypeOf(c10::ListType::ofTensors()) ||
+        output->type()->str().find("Tensor?[]") != std::string::npos) {
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        tensorlist.reserve(num_inputs);
+        for (auto in : inputs) {
+            auto in_tensor = engine->context().get_tensor(in);
+            POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to extract in_tensor for node: " << *node);
+            tensorlist.emplace_back(in_tensor);
+        }
+        engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+
+    // IntList
+    } else if (output->type()->isSubtypeOf(c10::ListType::ofInts())) {
+        // check whether the ints come in as nvtensors
+        if (check_inputs_tensor_scalar(engine, node)) {
+            std::vector<nvinfer1::ITensor*> inputs_nvtensor;
+            // collect the nvtensor of every int into a vector, then concatenate them
+            for (auto in : inputs) {
+                nvinfer1::ITensor* temp_tensor = this->get_tensor_scalar(in);
+                POROS_CHECK_TRUE((temp_tensor != nullptr), node_info(node) + std::string("get int nvtensor false."));
+                inputs_nvtensor.push_back(temp_tensor);
+            }
+            nvinfer1::IConcatenationLayer* concat_layer =
+                engine->network()->addConcatenation(inputs_nvtensor.data(), inputs_nvtensor.size());
+            // make sure the output type is int
+            concat_layer->setOutputType(0, nvinfer1::DataType::kINT32);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+            concat_layer->setAxis(0);
+            engine->context().set_tensor(node->outputs()[0], concat_layer->getOutput(0));
+        } else {
+            // normal input path
+            c10::List<int64_t> list;
+            list.reserve(num_inputs);
+            for (auto in : inputs) {
+                auto in_const = engine->context().get_constant(in);
+                list.emplace_back(std::move(in_const.toInt()));
+            }
+            auto output_ivalue = c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+            engine->context().set_constant(node->outputs()[0], output_ivalue);
+        }
+
+    // FloatList
+    } else if (output->type()->isSubtypeOf(c10::ListType::ofFloats())) {
+        c10::List<double> list;
+        list.reserve(num_inputs);
+        for (auto in : inputs) {
+            auto in_const = engine->context().get_constant(in);
+            list.emplace_back(std::move(in_const.toDouble()));
+        }
+        auto output_ivalue = c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+        engine->context().set_constant(node->outputs()[0], output_ivalue);
+
+    // BoolList
+    } else if (output->type()->isSubtypeOf(c10::ListType::ofBools())) {
+        c10::List<bool> list;
+        list.reserve(num_inputs);
+        for (auto in : inputs) {
+            auto in_const = engine->context().get_constant(in);
+            list.emplace_back(std::move(in_const.toBool()));
+        }
+        auto output_ivalue = c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+        engine->context().set_constant(node->outputs()[0], output_ivalue);
+
+    //TODO: meet some unsupported type
+    } else {
+        POROS_THROW_ERROR("Meet some unsupported output value type in ListConstructConverter" << *node);
+    }
+    return true;
+}
+
+bool ListUnpackConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    at::ArrayRef<const torch::jit::Value*> outputs = node->outputs();
+    // check whether the int[] comes in as an nvtensor
+    if (check_inputs_tensor_scalar(engine, node)) {
+        nvinfer1::ITensor* input_int_nvtensor = get_tensor_scalar(inputs[0]);
+        POROS_CHECK_TRUE((input_int_nvtensor != nullptr), node_info(node) + std::string("get int nvtensor false."));
+
+        nvinfer1::Dims input_dims = input_int_nvtensor->getDimensions();
+        // an int[] is one-dimensional; its extent is the number of ints to unpack
+        int64_t dim_rank = input_dims.d[0];
+        POROS_CHECK_TRUE((outputs.size() == (size_t)dim_rank),
+            "the input list size does not equal the output size for ListUnpackConverter as expected");
+        // slice out each int
+        for (int64_t i = 0; i < dim_rank; i++) {
+            std::vector<int64_t> start_vec{i}, size_vec{1}, stride_vec{1};
+            auto slice_layer = engine->network()->addSlice(*input_int_nvtensor,
+                sizes_to_nvdim(start_vec),
+                sizes_to_nvdim(size_vec),
+                sizes_to_nvdim(stride_vec));
+            POROS_CHECK(slice_layer, "Unable to create slice layer from node: " << *node);
+            slice_layer->setName((layer_info(node) + "_ISliceLayer" + std::to_string(i)).c_str());
+            nvinfer1::ITensor* slice_output = slice_layer->getOutput(0);
+            engine->context().set_tensor(outputs[i], slice_output);
+        }
+        return true;
+    }
+
+    std::vector<nvinfer1::ITensor*> output_vec;
+    engine->context().get_tensorlist(inputs[0], output_vec);
+    POROS_CHECK_TRUE((outputs.size() == output_vec.size()),
+        "the input list size does not equal the output size for ListUnpackConverter as expected");
+
+    //TODO: check if this implement is right, check output is tuple or multiple ivalues.
+    for (size_t index = 0; index < outputs.size(); index++) {
+        auto out = outputs[index];
+        //Tensor situation
+        if (out->type()->isSubtypeOf(c10::TensorType::get())) {
+            engine->context().set_tensor(out, output_vec[index]);
+        } else {
+            POROS_THROW_ERROR("Meet some unsupported output value type in ListUnpackConverter" << *node);
+        }
+    }
+    return true;
+}
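+
+// Illustrative sketch (not part of the original change): when an int[] arrives
+// as a single 1-D nvtensor, ListUnpackConverter splits it with one ISliceLayer
+// per element. Conceptually, the i-th output is:
+//
+//   std::vector<int64_t> start{i}, size{1}, stride{1};
+//   auto slice = engine->network()->addSlice(*list_tensor,
+//       sizes_to_nvdim(start), sizes_to_nvdim(size), sizes_to_nvdim(stride));
+//   // slice->getOutput(0) is a shape-{1} tensor holding element i.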
+
+// OP aten::list, original python code looks like: "for a_shape in list(data.shape): ......"
+bool ListConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    const torch::jit::Value* output = node->outputs()[0];
+    const auto num_inputs = inputs.size();
+    POROS_CHECK_TRUE((num_inputs == 1), "More than 1 input is not supported for node:" << *node);
+    auto input = inputs[0];
+    POROS_CHECK_TRUE((input->type()->str() == output->type()->str()), "Input and Output are in different types");
+    auto input_tensor = engine->context().get_tensor(input);
+    if (!input_tensor) {
+        std::vector<nvinfer1::ITensor*> tensor_list;
+        POROS_CHECK_TRUE(engine->context().get_tensorlist(input, tensor_list), "extract tensor list err");
+        engine->context().set_tensorlist(node->outputs()[0], tensor_list);
+    } else {
+        engine->context().set_tensor(node->outputs()[0], input_tensor);
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ListConstructConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ListUnpackConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ListConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/list.h b/poros/src/poros/converter/gpu/list.h
new file mode 100644
index 0000000000..12f4555e38
--- /dev/null
+++ b/poros/src/poros/converter/gpu/list.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file list.h
+* @author tianjinjin@baidu.com
+* @date Tue Jul 27 11:24:21 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ListConstructConverter : public GpuConverter {
+public:
+    ListConstructConverter() {}
+    virtual ~ListConstructConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //prim::ListConstruct kind node has no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::ListConstruct};
+    }
+};
+
+class ListUnpackConverter : public GpuConverter {
+public:
+    ListUnpackConverter() {}
+    virtual ~ListUnpackConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //prim::ListUnpack kind node has no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::ListUnpack};
+    }
+};
+
+class ListConverter : public GpuConverter {
+public:
+    ListConverter() {}
+    virtual ~ListConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::list kind node has no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::list};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/logical.cpp b/poros/src/poros/converter/gpu/logical.cpp
new file mode 100644
index 0000000000..bf95220262
--- /dev/null
+++ b/poros/src/poros/converter/gpu/logical.cpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file logical.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-02-17 18:32:04
+* @brief
+**/
+
+#include "poros/converter/gpu/logical.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool AndConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for AndConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for AndConverter is not Tensor as expected");
+    POROS_CHECK_TRUE(((inputs[0]->node()->kind() != torch::jit::prim::Constant) &&
+        (inputs[1]->node()->kind() != torch::jit::prim::Constant)),
+        "constant input is not supported for AndConverter");
+
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto other = engine->context().get_tensor(inputs[1]);
+    POROS_CHECK_TRUE((other != nullptr), "Unable to init input tensor for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getType() == nvinfer1::DataType::kBOOL) && (other->getType() == nvinfer1::DataType::kBOOL)),
+        "Only Bool type is supported for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getDimensions().nbDims > 0) && (other->getDimensions().nbDims > 0)),
+        "scalar input is not supported for node: " << *node);
+
+    auto new_layer = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kAND,
+        self,
+        other,
+        layer_info(node) + "_and");
+
+    POROS_CHECK(new_layer, "Unable to create And layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+bool OrConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for OrConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for OrConverter is not Tensor as expected");
+    POROS_CHECK_TRUE(((inputs[0]->node()->kind() != torch::jit::prim::Constant) &&
+        (inputs[1]->node()->kind() != torch::jit::prim::Constant)),
+        "constant input is not supported for OrConverter");
+
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto other = engine->context().get_tensor(inputs[1]);
+    POROS_CHECK_TRUE((other != nullptr), "Unable to init input tensor for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getType() == nvinfer1::DataType::kBOOL) && (other->getType() == nvinfer1::DataType::kBOOL)),
+        "Only Bool type is supported for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getDimensions().nbDims > 0) && (other->getDimensions().nbDims > 0)),
+        "scalar input is not supported for node: " << *node);
+
+    auto new_layer = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kOR,
+        self,
+        other,
+        layer_info(node) + "_or");
+
+    POROS_CHECK(new_layer, "Unable to create Or layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+bool XorConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for XorConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for XorConverter is not Tensor as expected");
+    POROS_CHECK_TRUE(((inputs[0]->node()->kind() != torch::jit::prim::Constant) &&
+        (inputs[1]->node()->kind() != torch::jit::prim::Constant)),
+        "constant input is not supported for XorConverter");
+
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto other = engine->context().get_tensor(inputs[1]);
+    POROS_CHECK_TRUE((other != nullptr), "Unable to init input tensor for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getType() == nvinfer1::DataType::kBOOL) && (other->getType() == nvinfer1::DataType::kBOOL)),
+        "Only Bool type is supported for node: " << *node);
+
+    POROS_CHECK_TRUE(((self->getDimensions().nbDims > 0) && (other->getDimensions().nbDims > 0)),
+        "scalar input is not supported for node: " << *node);
+
+    auto new_layer = add_elementwise(engine,
+        nvinfer1::ElementWiseOperation::kXOR,
+        self,
+        other,
+        layer_info(node) + "_xor");
+
+    POROS_CHECK(new_layer, "Unable to create Xor layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+bool NotConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for NotConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for NotConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[0]->node()->kind() != torch::jit::prim::Constant),
+        "constant input is not supported for NotConverter");
+
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto new_layer = engine->network()->addUnary(*self, nvinfer1::UnaryOperation::kNOT);
+
+    POROS_CHECK(new_layer, "Unable to create Not layer from node: " << *node);
+    new_layer->setName((layer_info(node) + "_IUnaryLayer").c_str());
+
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, AndConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, OrConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, XorConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, NotConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/logical.h b/poros/src/poros/converter/gpu/logical.h
new file mode 100644
index 0000000000..b168d6a50a
--- /dev/null
+++ b/poros/src/poros/converter/gpu/logical.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file logical.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-02-17 18:32:23
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class AndConverter : public GpuConverter {
+public:
+    AndConverter() {}
+    virtual ~AndConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::__and__.Scalar(Tensor self, Scalar other) -> Tensor
+    const std::vector<std::string> schema_string() {
+        return {
+            "aten::__and__.Tensor(Tensor self, Tensor other) -> Tensor",
+            "aten::bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::__and__,
+                torch::jit::aten::__iand__,
+                torch::jit::aten::bitwise_and,
+        };
+    }
+};
+
+class OrConverter : public GpuConverter {
+public:
+    OrConverter() {}
+    virtual ~OrConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {
+            "aten::__or__.Tensor(Tensor self, Tensor other) -> Tensor",
+            "aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::__or__,
+                torch::jit::aten::__ior__,
+                torch::jit::aten::bitwise_or,
+        };
+    }
+};
+
+class XorConverter : public GpuConverter {
+public:
+    XorConverter() {}
+    virtual ~XorConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {
+            "aten::__xor__.Tensor(Tensor self, Tensor other) -> Tensor",
+            "aten::bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::__xor__,
+                torch::jit::aten::__ixor__,
+                torch::jit::aten::bitwise_xor,
+        };
+    }
+};
+
+class NotConverter : public GpuConverter {
+public:
+    NotConverter() {}
+    virtual ~NotConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //aten::bitwise_not(Tensor self) -> Tensor
+    const std::vector<std::string> schema_string() {
+        return {
+            "aten::bitwise_not(Tensor self) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {
+            torch::jit::aten::bitwise_not,
+        };
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/lstm.cpp b/poros/src/poros/converter/gpu/lstm.cpp
new file mode 100644
index 0000000000..8d59998e24
--- /dev/null
+++ b/poros/src/poros/converter/gpu/lstm.cpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file lstm.cpp
+* @author wangrui39@baidu.com
+* @date Mon December 13 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/lstm.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+#include "poros/converter/gpu/add.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
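+// Note on the parameter layout handled below (annotation, not part of the
+// original change): per the torch.nn.LSTM docs, each parameter tensor packs
+// the four gates along dim 0 in (input, forget, cell, output) order. With
+// hidden_size H, weight_ih_l0 has shape [4H, input_size]:
+//
+//   rows [0,   H)  -> kINPUT   (idx == 0)
+//   rows [H,  2H)  -> kFORGET  (idx == 1)
+//   rows [2H, 3H)  -> kCELL    (idx == 2)
+//   rows [3H, 4H)  -> kOUTPUT  (idx == 3)
+//
+// add_rnnv2_params slices block `idx` out of `params` and hands it to
+// setWeightsForGate / setBiasForGate on the IRNNv2Layer. For bidirectional
+// LSTMs, param_list holds eight tensors per layer (forward then reverse),
+// which is why rela_index doubles below.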
+bool add_rnnv2_params(at::Tensor params, nvinfer1::IRNNv2Layer* &layer, bool isW, int rela_index,
+                    int hidden_size, int idx, nvinfer1::RNNGateType gate, bool bias = false) {
+    std::vector<at::Tensor> w;
+    for (int i = idx * hidden_size; i < hidden_size * (idx + 1); i++) {
+        w.push_back(params[i].unsqueeze(0));
+    }
+    if (bias) {
+        layer->setBiasForGate(rela_index, gate, isW, Weights(at::cat(w, 0)).data);
+    } else {
+        layer->setWeightsForGate(rela_index, gate, isW, Weights(at::cat(w, 0)).data);
+    }
+    return true;
+}
+
+bool LstmConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    /*aten::lstm.input(Tensor input,
+        Tensor[] hx,
+        Tensor[] params,
+        bool has_biases,
+        int num_layers,
+        float dropout,
+        bool train,
+        bool bidirectional,
+        bool batch_first) -> (Tensor, Tensor, Tensor)*/
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 9), "invalid inputs size for LstmConverter");
+
+    // get the inputs
+    nvinfer1::ITensor *input = engine->context().get_tensor(inputs[0]);
+    std::vector<nvinfer1::ITensor*> hx_tensorlist;
+    engine->context().get_tensorlist(inputs[1], hx_tensorlist);
+    POROS_CHECK_TRUE((hx_tensorlist.size() == 2), "Unable to init input List[tensor] for node: " << *node);
+
+    // get the params
+    torch::jit::IValue params = engine->context().get_constant(inputs[2]);
+    POROS_CHECK_TRUE((params.isTensorList()), "Unable to init second input List[tensor] for node: " << *node);
+    c10::List<at::Tensor> param_list = params.toTensorList();
+    int num_layers = engine->context().get_constant(inputs[4]).toInt();
+    bool bidirectional = engine->context().get_constant(inputs[7]).toBool();
+    bool batch_first = engine->context().get_constant(inputs[8]).toBool();
+
+    // gather the inputs needed to build the TRT RNN layer
+    nvinfer1::ITensor *h_0 = hx_tensorlist[0];
+    nvinfer1::ITensor *c_0 = hx_tensorlist[1];
+    int32_t hidden_size = c_0->getDimensions().d[c_0->getDimensions().nbDims - 1];
+
+    if (!batch_first) {
+        auto input_shuffle_layer = engine->network()->addShuffle(*input);
+        input_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2});
+        input_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_input").c_str());
+        input = input_shuffle_layer->getOutput(0);
+    }
+    int max_seqlen = input->getDimensions().d[1];
+
+    // use TensorRT's built-in LSTM (IRNNv2Layer)
+    auto rnnv2_layer = engine->network()->addRNNv2(*input, num_layers, hidden_size, max_seqlen, nvinfer1::RNNOperation::kLSTM);
+    if (bidirectional) {
+        rnnv2_layer->setDirection(nvinfer1::RNNDirection::kBIDIRECTION);
+    }
+    rnnv2_layer->setName((layer_info(node) + "_IRNNv2Layer").c_str());
engine->network()->addShuffle(*c_0); + c_0_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2}); + c_0_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_c0").c_str()); + rnnv2_layer->setCellState(*c_0_shuffle_layer->getOutput(0)); + + auto h_0_shuffle_layer = engine->network()->addShuffle(*h_0); + h_0_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2}); + h_0_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_h0").c_str()); + rnnv2_layer->setHiddenState(*h_0_shuffle_layer->getOutput(0)); + + // install weights and biases layer by layer; PyTorch packs params as [w_ih, w_hh, b_ih, b_hh] per layer (and per direction) + for (int i = 0; i < num_layers; i++){ + size_t rela_index = 0; + if (bidirectional) { + rela_index = 2 * i; + } + else { + rela_index = i; + } + + // weight_ih_l + add_rnnv2_params(param_list[rela_index * 4 + 0], rnnv2_layer, true, rela_index, hidden_size, 0, nvinfer1::RNNGateType::kINPUT); + add_rnnv2_params(param_list[rela_index * 4 + 0], rnnv2_layer, true, rela_index, hidden_size, 1, nvinfer1::RNNGateType::kFORGET); + add_rnnv2_params(param_list[rela_index * 4 + 0], rnnv2_layer, true, rela_index, hidden_size, 2, nvinfer1::RNNGateType::kCELL); + add_rnnv2_params(param_list[rela_index * 4 + 0], rnnv2_layer, true, rela_index, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT); + + // weight_hh_l + add_rnnv2_params(param_list[rela_index * 4 + 1], rnnv2_layer, false, rela_index, hidden_size, 0, nvinfer1::RNNGateType::kINPUT); + add_rnnv2_params(param_list[rela_index * 4 + 1], rnnv2_layer, false, rela_index, hidden_size, 1, nvinfer1::RNNGateType::kFORGET); + add_rnnv2_params(param_list[rela_index * 4 + 1], rnnv2_layer, false, rela_index, hidden_size, 2, nvinfer1::RNNGateType::kCELL); + add_rnnv2_params(param_list[rela_index * 4 + 1], rnnv2_layer, false, rela_index, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT); + + // bias_ih_l + add_rnnv2_params(param_list[rela_index * 4 + 2], rnnv2_layer, true, rela_index, hidden_size, 0, nvinfer1::RNNGateType::kINPUT, true); + add_rnnv2_params(param_list[rela_index * 4 + 2], rnnv2_layer, true, rela_index, hidden_size, 1, nvinfer1::RNNGateType::kFORGET, true); + add_rnnv2_params(param_list[rela_index * 4 + 2], rnnv2_layer, true, rela_index, hidden_size, 2, nvinfer1::RNNGateType::kCELL, true); + add_rnnv2_params(param_list[rela_index * 4 + 2], rnnv2_layer, true, rela_index, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT, true); + + // bias_hh_l + add_rnnv2_params(param_list[rela_index * 4 + 3], rnnv2_layer, false, rela_index, hidden_size, 0, nvinfer1::RNNGateType::kINPUT, true); + add_rnnv2_params(param_list[rela_index * 4 + 3], rnnv2_layer, false, rela_index, hidden_size, 1, nvinfer1::RNNGateType::kFORGET, true); + add_rnnv2_params(param_list[rela_index * 4 + 3], rnnv2_layer, false, rela_index, hidden_size, 2, nvinfer1::RNNGateType::kCELL, true); + add_rnnv2_params(param_list[rela_index * 4 + 3], rnnv2_layer, false, rela_index, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT, true); + + if (bidirectional) { + // ================reverse===================== + // weight_ih_l + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 0], rnnv2_layer, true, rela_index + 1, hidden_size, 0, nvinfer1::RNNGateType::kINPUT); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 0], rnnv2_layer, true, rela_index + 1, hidden_size, 1, nvinfer1::RNNGateType::kFORGET); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 0], rnnv2_layer, true, rela_index + 1, hidden_size, 2, nvinfer1::RNNGateType::kCELL); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 0], rnnv2_layer, true, rela_index + 1, hidden_size, 3, 
nvinfer1::RNNGateType::kOUTPUT); + + // weight_hh_l + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 1], rnnv2_layer, false, rela_index + 1, hidden_size, 0, nvinfer1::RNNGateType::kINPUT); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 1], rnnv2_layer, false, rela_index + 1, hidden_size, 1, nvinfer1::RNNGateType::kFORGET); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 1], rnnv2_layer, false, rela_index + 1, hidden_size, 2, nvinfer1::RNNGateType::kCELL); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 1], rnnv2_layer, false, rela_index + 1, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT); + + // bias_ih_l + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 2], rnnv2_layer, true, rela_index + 1, hidden_size, 0, nvinfer1::RNNGateType::kINPUT, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 2], rnnv2_layer, true, rela_index + 1, hidden_size, 1, nvinfer1::RNNGateType::kFORGET, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 2], rnnv2_layer, true, rela_index + 1, hidden_size, 2, nvinfer1::RNNGateType::kCELL, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 2], rnnv2_layer, true, rela_index + 1, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT, true); + + // bias_hh_l + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 3], rnnv2_layer, false, rela_index + 1, hidden_size, 0, nvinfer1::RNNGateType::kINPUT, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 3], rnnv2_layer, false, rela_index + 1, hidden_size, 1, nvinfer1::RNNGateType::kFORGET, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 3], rnnv2_layer, false, rela_index + 1, hidden_size, 2, nvinfer1::RNNGateType::kCELL, true); + add_rnnv2_params(param_list[(rela_index + 1) * 4 + 3], rnnv2_layer, false, rela_index + 1, hidden_size, 3, nvinfer1::RNNGateType::kOUTPUT, true); + } + } + + nvinfer1::ITensor* output = rnnv2_layer->getOutput(0); + if (!batch_first) { + auto output1_shuffle_layer = engine->network()->addShuffle(*output); + output1_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2}); + output1_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_output").c_str()); + output = output1_shuffle_layer->getOutput(0); + } + auto output2_shuffle_layer = engine->network()->addShuffle(*rnnv2_layer->getOutput(1)); + output2_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2}); + output2_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_output1").c_str()); + auto output3_shuffle_layer = engine->network()->addShuffle(*rnnv2_layer->getOutput(2)); + output3_shuffle_layer->setFirstTranspose(nvinfer1::Permutation{1, 0, 2}); + output3_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_output2").c_str()); + + engine->context().set_tensor(node->outputs()[0], output); + engine->context().set_tensor(node->outputs()[1], output2_shuffle_layer->getOutput(0)); + engine->context().set_tensor(node->outputs()[2], output3_shuffle_layer->getOutput(0)); + + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, LstmConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/lstm.h b/poros/src/poros/converter/gpu/lstm.h new file mode 100644 index 0000000000..eb69ffd127 --- /dev/null +++ b/poros/src/poros/converter/gpu/lstm.h @@ -0,0 +1,63 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file lstm.h +* @author wangrui39@baidu.com +* @date Mon January 17 11:36:11 CST 2022 +* @brief +**/ + +#pragma once + +#include <string> +#include <vector> + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// Corresponds to torch.nn.LSTM https://pytorch.org/docs/1.9.1/generated/torch.nn.LSTM.html?highlight=lstm#torch.nn.LSTM +class LstmConverter : public GpuConverter { +public: + LstmConverter() {} + virtual ~LstmConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector<std::string> schema_string() { + return {"aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)"}; + } + + /** TODO: TO SUPPORT CONVERTERS BELOW: + * aten::lstm.input(Tensor input, + * Tensor[] hx, Tensor[] params, + * bool has_biases, int num_layers, float dropout, + * bool train, + * bool bidirectional, + * bool batch_first) -> (Tensor, Tensor, Tensor) + * **/ + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::lstm}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/lstm_cell.cpp b/poros/src/poros/converter/gpu/lstm_cell.cpp new file mode 100644 index 0000000000..3eb12685e1 --- /dev/null +++ b/poros/src/poros/converter/gpu/lstm_cell.cpp @@ -0,0 +1,197 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** +* @file lstm_cell.cpp +* @author wangrui39@baidu.com +* @date Mon December 13 11:36:11 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/lstm_cell.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" +#include "poros/converter/gpu/add.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// left-pad a Dims with size-1 dimensions until it has pad_to dims +nvinfer1::Dims todims_pad(nvinfer1::Dims s_dim, int32_t pad_to) { + if (s_dim.nbDims > pad_to){ + LOG(WARNING) << "Requested padding of dimensions to " << pad_to << " but found " << + s_dim.nbDims << " dimensions, not going to pad"; + return s_dim; + } + + nvinfer1::Dims dims; + dims.nbDims = pad_to; + for (int32_t i = 0; i < pad_to - s_dim.nbDims; ++i) { + dims.d[i] = 1; + } + for (int32_t i = pad_to - s_dim.nbDims; i < pad_to; ++i) { + dims.d[i] = s_dim.d[i - (pad_to - s_dim.nbDims)]; + } + return dims; +} + +// compute input @ w^T (+ b, broadcast to the matmul output rank when present) +nvinfer1::ITensor* calculate_gate( + TensorrtEngine* engine, nvinfer1::ITensor *input, nvinfer1::ITensor *w, + std::string b_name = "", nvinfer1::ITensor *b = nullptr) { + + auto mm = engine->network()->addMatrixMultiply( + *input, nvinfer1::MatrixOperation::kNONE, *w, nvinfer1::MatrixOperation::kTRANSPOSE); + nvinfer1::ITensor *mm_out = mm->getOutput(0); + + if (b != nullptr) { + auto mout_dim = mm_out->getDimensions(); + auto b_dim = b->getDimensions(); + + if (mout_dim.nbDims != b_dim.nbDims) { + auto shuffle = engine->network()->addShuffle(*b); + shuffle->setReshapeDimensions(todims_pad(b_dim, mout_dim.nbDims)); + b = shuffle->getOutput(0); + } + + auto add_layer = engine->network()->addElementWise(*mm_out, *b, nvinfer1::ElementWiseOperation::kSUM); + return add_layer->getOutput(0); + } + else { + return mm_out; + } + +} + +bool LstmCellConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + //aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> (Tensor, Tensor) + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "inputs[0] for LstmCellConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::ListType::ofTensors())), + "inputs[1] for LstmCellConverter is not TensorList as expected"); + POROS_CHECK_TRUE((inputs[2]->type()->isSubtypeOf(c10::TensorType::get())), + "inputs[2] for LstmCellConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[3]->type()->isSubtypeOf(c10::TensorType::get())), + "inputs[3] for LstmCellConverter is not Tensor as expected"); + + //extract Tensors[] + std::vector<nvinfer1::ITensor*> state; + bool ret = engine->context().get_tensorlist(inputs[1], state); + POROS_CHECK_TRUE((state.size() == 2), "Unable to init input List[tensor] for node: " << *node); + POROS_CHECK_TRUE(ret, "Unable to init input List[tensor] for node: " << *node); + + //extract Tensor + nvinfer1::ITensor *input = engine->context().get_tensor(inputs[0]); + nvinfer1::ITensor *w_ih = engine->context().get_tensor(inputs[2]); + nvinfer1::ITensor *w_hh = engine->context().get_tensor(inputs[3]); + + // calculate first half of gates + nvinfer1::ITensor *out1 = nullptr; + nvinfer1::ITensor *out2 = nullptr; + + if (inputs[4]->type()->isSubtypeOf(c10::TensorType::get())) { + nvinfer1::ITensor *b_ih = engine->context().get_tensor(inputs[4]); + out1 = calculate_gate(engine, input, w_ih, "b_ih", b_ih); + } + else { + out1 = calculate_gate(engine, input, w_ih); + } + POROS_CHECK_TRUE((out1 != nullptr), "invalid b_ih for LstmCellConverter"); + + // calculate second half of gates + if (inputs[5]->type()->isSubtypeOf(c10::TensorType::get())) { + nvinfer1::ITensor *b_hh = engine->context().get_tensor(inputs[5]); + out2 = calculate_gate(engine, state[0], w_hh, "b_hh", b_hh); + } + else { + out2 = calculate_gate(engine, state[0], w_hh); + } + POROS_CHECK_TRUE((out2 != nullptr), "invalid b_hh for LstmCellConverter"); + + // get all 4 gates + auto add_layer = engine->network()->addElementWise(*out1, *out2, nvinfer1::ElementWiseOperation::kSUM); + add_layer->setName((layer_info(node) + "_sum_" + "for_add_out").c_str()); + nvinfer1::ITensor *add_out = add_layer->getOutput(0); + + // chunk Tensor into 4 parts and apply activation functions + auto dims = add_out->getDimensions().d; + auto batch = dims[0]; + auto hidden = dims[1] / 4; + + auto size = nvinfer1::Dims2(batch, hidden); + auto stride = nvinfer1::Dims2(1, 1); + auto offset0 = nvinfer1::Dims2(0, 0); + auto offset1 = nvinfer1::Dims2(0, hidden); + auto offset2 = nvinfer1::Dims2(0, 2 * hidden); + auto offset3 = nvinfer1::Dims2(0, 3 * hidden); + + auto slice1 = engine->network()->addSlice(*add_out, offset0, size, stride); + slice1->setName((layer_info(node) + "_ISliceLayer_" + "for_offset0").c_str()); + auto active1 = engine->network()->addActivation(*slice1->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + active1->setName((layer_info(node) + "_IActivationLayer_" + "for_offset0").c_str()); + auto ingate = active1->getOutput(0); + + auto slice2 = engine->network()->addSlice(*add_out, offset1, size, stride); + slice2->setName((layer_info(node) + "_ISliceLayer_" + "for_offset1").c_str()); + auto active2 = engine->network()->addActivation(*slice2->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + active2->setName((layer_info(node) + "_IActivationLayer_" + "for_offset1").c_str()); + auto forgetgate = active2->getOutput(0); + + auto slice3 = engine->network()->addSlice(*add_out, offset2, size, stride); + 
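// offset2 is the cell-candidate (g) chunk; in PyTorch's i, f, g, o gate layout it is the only slice activated with tanh rather than sigmoid +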
slice3->setName((layer_info(node) + "_ISliceLayer_" + "for_offset2").c_str()); + auto active3 = engine->network()->addActivation(*slice3->getOutput(0), nvinfer1::ActivationType::kTANH); + active3->setName((layer_info(node) + "_IActivationLayer_" + "for_offset2").c_str()); + auto cellgate = active3->getOutput(0); + + auto slice4 = engine->network()->addSlice(*add_out, offset3, size, stride); + slice4->setName((layer_info(node) + "_ISliceLayer_" + "for_offset3").c_str()); + auto active4 = engine->network()->addActivation(*slice4->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + active4->setName((layer_info(node) + "_IActivationLayer_" + "for_offset3").c_str()); + auto outgate = active4->getOutput(0); + + // compute cy + auto forget_cx = engine->network()->addElementWise(*forgetgate, *state[1], nvinfer1::ElementWiseOperation::kPROD); + forget_cx->setName((layer_info(node) + "_prod_" + "for_forget_cx").c_str()); + auto in_cell = engine->network()->addElementWise(*ingate, *cellgate, nvinfer1::ElementWiseOperation::kPROD); + in_cell->setName((layer_info(node) + "_prod_" + "for_in_cell").c_str()); + auto cy = engine->network()->addElementWise( + *forget_cx->getOutput(0), *in_cell->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); + cy->setName((layer_info(node) + "_sum_" + "for_cy").c_str()); + auto cy_out = cy->getOutput(0); + + // compute hy + auto cy_tanh = engine->network()->addActivation(*cy_out, nvinfer1::ActivationType::kTANH); + cy_tanh->setName((layer_info(node) + "_IActivationLayer_" + "for_cy_tanh").c_str()); + auto hy = engine->network()->addElementWise(*outgate, *cy_tanh->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + hy->setName((layer_info(node) + "_prod_" + "for_hy").c_str()); + auto hy_out = hy->getOutput(0); + + engine->context().set_tensor(node->outputs()[0], hy_out); + engine->context().set_tensor(node->outputs()[1], cy_out); + + LOG(INFO) << "Output hy tensor shape: " << hy_out->getDimensions(); + LOG(INFO) << "Output cy tensor shape: " << cy_out->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, LstmCellConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/lstm_cell.h b/poros/src/poros/converter/gpu/lstm_cell.h new file mode 100644 index 0000000000..17d2c13974 --- /dev/null +++ b/poros/src/poros/converter/gpu/lstm_cell.h @@ -0,0 +1,63 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** +* @file lstm_cell.h +* @author wangrui39@baidu.com +* @date Mon December 13 11:36:11 CST 2021 +* @brief +**/ + +#pragma once + +#include <string> +#include <vector> + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// Corresponds to torch.lstm_cell https://pytorch.org/docs/stable/generated/torch.nn.LSTMCell.html +class LstmCellConverter : public GpuConverter { +public: + LstmCellConverter() {} + virtual ~LstmCellConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + //aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + const std::vector<std::string> schema_string() { + return {"aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)"}; + } + + /** TODO: TO SUPPORT CONVERTERS BELOW: + * "aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)", + * **/ + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::lstm_cell}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)", {0, 0}}}); + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/matrix_multiply.cpp b/poros/src/poros/converter/gpu/matrix_multiply.cpp new file mode 100644 index 0000000000..61c256be99 --- /dev/null +++ b/poros/src/poros/converter/gpu/matrix_multiply.cpp @@ -0,0 +1,422 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file matrix_multiply.cpp +* @author tianjinjin@baidu.com +* @date Mon Mar 8 11:36:11 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/aten_trt_util.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/converter/gpu/matrix_multiply.h" +#include "poros/converter/gpu/weight.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/** aten::matmul(Tensor self, Tensor other) -> Tensor + * this can be more complicated than expected. + * The behavior depends on the dimensionality of the Tensors as follows: +- If both Tensors are 1-dimensional, the dot product (scalar) is returned. +- If both arguments are 2-dimensional, the matrix-matrix product is returned. +- If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed.
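+ (e.g. matmul of an (m)-vector with an (m x p) matrix yields a (p)-vector.)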
+- If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. +- If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiply and removed after. + The non-matrix (i.e. batch) dimensions are broadcasted (and thus + must be broadcastable). For example, if tensor1 is a (j x 1 x n x m) Tensor + and tensor2 is a (k x m x p) Tensor, the returned tensor will be a (j x k x n x p) Tensor. +the original pytorch implementation is: +https://github.com/pytorch/pytorch/blob/v1.9.0/aten/src/ATen/native/LinearAlgebra.cpp#L1354 +*/ +nvinfer1::ITensor* MatmulConverter::converter(TensorrtEngine* engine, + const torch::jit::Node *node, + nvinfer1::ITensor* self, + nvinfer1::ITensor* other) { + auto self_dim = self->getDimensions().nbDims; + auto other_dim = other->getDimensions().nbDims; + auto origin_self_size = nvdim_to_sizes(self->getDimensions()); + auto origin_other_size = nvdim_to_sizes(other->getDimensions()); + + LOG(INFO) << "self dim info : " << self->getDimensions() << " and other dim info: " << other->getDimensions(); + + nvinfer1::ILayer* mm_layer = nullptr; + //situation one: both tensors are 1D. this is like aten::dot + if (self_dim == 1 && other_dim == 1) { + mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kVECTOR, *other, nvinfer1::MatrixOperation::kVECTOR); + + //situation two: input tensor is 1D. + } else if (self_dim == 1 && other_dim == 2) { + mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kVECTOR, *other, nvinfer1::MatrixOperation::kNONE); + + //situation three: other tensor is 1D. 
+ } else if (self_dim == 2 && other_dim == 1) { + mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kVECTOR); + + //situation four: input tensor is N-D (N > 2) and other tensor is 1D or 2D + } else if (self_dim > 2 && (other_dim == 1 || other_dim == 2)) { + if (other_dim == 1) { + auto other_shuffle = engine->network()->addShuffle(*other); + POROS_CHECK(other_shuffle, "Unable to create other shuffle layer for MatmulConverter"); + other_shuffle->setReshapeDimensions(unsqueeze_dims(other->getDimensions(), 1)); + other_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_other").c_str()); + other = other_shuffle->getOutput(0); + LOG(INFO) << "after shuffle other dim info turn to: " << other->getDimensions(); + } + + //prepare output_size info + std::vector<int64_t> output_size; + output_size.insert(output_size.end(), origin_self_size.begin(), origin_self_size.end() - 1); + if (other_dim == 2) { + auto other_size = nvdim_to_sizes(other->getDimensions()); + output_size.push_back(other_size[1]); + } + + std::vector<int64_t> new_order = {-1, origin_self_size[self_dim - 1]}; + auto self_shuffle = engine->network()->addShuffle(*self); + POROS_CHECK(self_shuffle, "Unable to create self shuffle layer for MatmulConverter"); + self_shuffle->setReshapeDimensions(sizes_to_nvdim(new_order)); + self_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_self").c_str()); + self = self_shuffle->getOutput(0); + LOG(INFO) << "after shuffle self dim info turn to: " << self->getDimensions(); + + auto tmp_mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kNONE); + POROS_CHECK(tmp_mm_layer, "Unable to create matrixmul layer for MatmulConverter"); + tmp_mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer").c_str()); + auto tmp_output = tmp_mm_layer->getOutput(0); + LOG(INFO) << "matmul output dim info : " << tmp_output->getDimensions(); + + auto out_shuffle = engine->network()->addShuffle(*tmp_output); + POROS_CHECK(out_shuffle, "Unable to create shuffle layer for MatmulConverter"); + out_shuffle->setReshapeDimensions(sizes_to_nvdim(output_size)); + out_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_out").c_str()); + auto output = out_shuffle->getOutput(0); + LOG(INFO) << "reshape output back to original dim info : " << output->getDimensions(); + return output; + + //situation five: other tensor is N-D (N > 2) and self tensor is 1D or 2D + } else if (other_dim > 2 && (self_dim == 1 || self_dim == 2)) { + const int64_t n = self_dim == 2 ? origin_self_size[0] : 1; + const int64_t m = origin_self_size[self_dim - 1]; + const int64_t p = origin_other_size[other_dim - 1]; + + //let's do other.transpose(-1, -2) + std::vector<int64_t> new_order; + for (int i = 0; i < other_dim; i++) { + new_order.push_back(i); + } + new_order[other_dim - 1] = new_order[other_dim - 2]; + new_order[other_dim - 2] = other_dim - 1; + auto other_shuffle = engine->network()->addShuffle(*other); + POROS_CHECK(other_shuffle, "Unable to create shuffle layer from node: " << *node); + nvinfer1::Permutation permute; + std::copy(new_order.begin(), new_order.end(), permute.order); + other_shuffle->setSecondTranspose(permute); + other_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_other").c_str()); + other = other_shuffle->getOutput(0); + LOG(INFO) << "after transpose other dim info turn to: " << other->getDimensions(); + + //self_T = self_dim == 2 ? 
self.t() : self.reshape({n, m}).t(); + if (self_dim == 1) { + //tensor1.reshape({n, m}) + std::vector<int64_t> new_shape; + new_shape = torch::reshape(torch::rand(origin_self_size), {n, m}).sizes().vec(); + auto tmp_shuffle = engine->network()->addShuffle(*self); + POROS_CHECK(tmp_shuffle, "Unable to create shuffle layer for MatmulConverter"); + tmp_shuffle->setReshapeDimensions(sizes_to_nvdim(new_shape)); + tmp_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_self_tmp").c_str()); + self = tmp_shuffle->getOutput(0); + LOG(INFO) << "after reshape self dim info turn to: " << self->getDimensions(); + } + //self.t() + auto self_shuffle = engine->network()->addShuffle(*self); + POROS_CHECK(self_shuffle, "Unable to create shuffle layer for MatmulConverter"); + nvinfer1::Permutation first_perm; + first_perm.order[0] = 1; + first_perm.order[1] = 0; + self_shuffle->setFirstTranspose(first_perm); + self_shuffle->setZeroIsPlaceholder(false); + self_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_self").c_str()); + self = self_shuffle->getOutput(0); + LOG(INFO) << "after transpose self dim info turn to: " << self->getDimensions(); + + //compute matmul(other.t(), self.t()) via a recursive call + auto mm_output = converter(engine, node, other, self); + POROS_CHECK(mm_output, "Unable to calculate transpose matmul for MatmulConverter"); + auto mm_dim = mm_output->getDimensions().nbDims; + auto mm_dim_size = nvdim_to_sizes(mm_output->getDimensions()); + + //now transpose the result back + if (self_dim == 2) { + std::vector<int64_t> new_order; + for (int i = 0; i < mm_dim; i++) { + new_order.push_back(i); + } + new_order[mm_dim - 1] = new_order[mm_dim - 2]; + new_order[mm_dim - 2] = mm_dim - 1; + auto mm_shuffle = engine->network()->addShuffle(*mm_output); + POROS_CHECK(mm_shuffle, "Unable to create shuffle layer for MatmulConverter"); + nvinfer1::Permutation permute; + std::copy(new_order.begin(), new_order.end(), permute.order); + mm_shuffle->setSecondTranspose(permute); + mm_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_output").c_str()); + auto output = mm_shuffle->getOutput(0); + LOG(INFO) << "after transposing back, output dim info turns to: " << output->getDimensions(); + return output; + } else { + //res_tensor.reshape(shape) + std::vector<int64_t> shape; + for (int i = 0; i < other_dim - 2; i++) { + shape.push_back(origin_other_size[i]); + } + shape.push_back(p); + auto new_shape = torch::reshape(torch::rand(mm_dim_size), shape).sizes().vec(); + auto mm_shuffle = engine->network()->addShuffle(*mm_output); + POROS_CHECK(mm_shuffle, "Unable to create shuffle layer for MatmulConverter"); + mm_shuffle->setReshapeDimensions(sizes_to_nvdim(new_shape)); + mm_shuffle->setName((layer_info(node) + "_IShuffleLayer_for_output").c_str()); + auto output = mm_shuffle->getOutput(0); + LOG(INFO) << "after reshaping back, output dim info turns to: " << output->getDimensions(); + return output; + } + + } else { + // expanding the dimensions if necessary. 
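+ // both operands keep at least 2 dims in this branch; the lower-rank one is left-padded with size-1 dims so the batched matmul can broadcast the batch dimensions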
+ if (self->getDimensions().nbDims < other->getDimensions().nbDims) { + auto newDims = self->getDimensions(); + for (int dim = self->getDimensions().nbDims; dim < other->getDimensions().nbDims; ++dim) { + newDims = unsqueeze_dims(newDims, 0, 1, false); + } + LOG(INFO) << "Original self shape: " << self->getDimensions() << ", reshaping to: " << newDims; + auto shuffle_layer = engine->network()->addShuffle(*self); + POROS_CHECK(shuffle_layer, "Unable to create shuffle layer for MatmulConverter"); + shuffle_layer->setReshapeDimensions(newDims); + shuffle_layer->setZeroIsPlaceholder(false); + shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_self").c_str()); + self = shuffle_layer->getOutput(0); + //self = add_padding(engine, node, self, other->getDimensions().nbDims, false, false); + } else if (other->getDimensions().nbDims < self->getDimensions().nbDims) { + auto newDims = other->getDimensions(); + for (int dim = other->getDimensions().nbDims; dim < self->getDimensions().nbDims; ++dim) { + newDims = unsqueeze_dims(newDims, 0, 1, false); + } + LOG(INFO) << "Original other shape: " << other->getDimensions() << ", reshaping to: " << newDims; + auto shuffle_layer = engine->network()->addShuffle(*other); + POROS_CHECK(shuffle_layer, "Unable to create shuffle layer for MatmulConverter"); + shuffle_layer->setReshapeDimensions(newDims); + shuffle_layer->setZeroIsPlaceholder(false); + shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_other").c_str()); + other = shuffle_layer->getOutput(0); + //other = add_padding(engine, node, other, self->getDimensions().nbDims, false, false); + } + + mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kNONE); + } + + POROS_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *node); + mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer_for_other").c_str()); + auto output = mm_layer->getOutput(0); + return output; +} + +bool MatmulConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for MatmulConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for MatmulConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())), + "input[1] for MatmulConverter is not Tensor as expected"); + + auto self = engine->context().get_tensor(inputs[0]); + auto other = engine->context().get_tensor(inputs[1]); + POROS_CHECK_TRUE(((self != nullptr) && (other != nullptr)), + "Unable to init input tensor for node: " << *node); + + //add more log info for matmulConverter + LOG(INFO) << "input[0] tensor is: " << node_info(inputs[0]->node()); + LOG(INFO) << "input[1] tensor is: " << node_info(inputs[1]->node()); + + auto output = converter(engine, node, self, other); + if (output != nullptr) { + engine->context().set_tensor(node->outputs()[0], output); + LOG(INFO) << "Output tensor shape: " << output->getDimensions(); + return true; + } else { + return false; + } +} + +/* aten::bmm(Tensor self, Tensor mat2) -> Tensor */ +bool BmmConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for BmmConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for BmmConverter is not Tensor as expected");
+ POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())), + "input[1] for BmmConverter is not Tensor as expected"); + + + auto self = engine->context().get_tensor(inputs[0]); + auto mat2 = engine->context().get_tensor(inputs[1]); + POROS_CHECK_TRUE(((self != nullptr) && (mat2 != nullptr)), + "Unable to init input tensor for node: " << *node); + + nvinfer1::Dims selfDims = self->getDimensions(); + nvinfer1::Dims mat2Dims = mat2->getDimensions(); + + // check dimensions + POROS_CHECK(selfDims.nbDims == 3, + "Expected 3-dimensional tensor, but got " << selfDims.nbDims + << "-dimensional tensor for argument #1 'batch1' (while checking arguments for bmm)"); + POROS_CHECK(mat2Dims.nbDims == 3, + "Expected 3-dimensional tensor, but got " << mat2Dims.nbDims + << "-dimensional tensor for argument #2 'batch2' (while checking arguments for bmm)"); + + // Self and mat2 should have same size at dimension 0 + POROS_CHECK(selfDims.d[0] == mat2Dims.d[0], + "Expected tensor to have size " << selfDims.d[0] << " at dimension 0, but got size " << mat2Dims.d[0] + << " for argument #2 'batch2' (while checking arguments for bmm)"); + + // The size of mat2 at dimension 1 should be the same as that of self at dimension 2. + POROS_CHECK(selfDims.d[2] == mat2Dims.d[1], + "Expected tensor to have size " << selfDims.d[2] << " at dimension 1, but got size " << mat2Dims.d[1] + << " for argument #2 'batch2' (while checking arguments for bmm)"); + + auto mm_layer = engine->network()->addMatrixMultiply( + *self, nvinfer1::MatrixOperation::kNONE, *mat2, nvinfer1::MatrixOperation::kNONE); + POROS_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *node); + + mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer").c_str()); + engine->context().set_tensor(node->outputs()[0], mm_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << mm_layer->getOutput(0)->getDimensions(); + return true; +} + +/** + * aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + * check the function in pytorch: aten/src/ATen/RegisterSparseCuda.cpp + * at::native::addmm_sparse_dense_cuda(self, mat1, mat2, beta, alpha) + * and the docs are here: https://pytorch.org/docs/stable/generated/torch.addmm.html + * + * %out: Tensor = aten::addmm(%bias, %mat1, %mat2, %beta, %alpha) + * according to the torch.addmm explanation, 
the result is: + * out = %beta * %bias + %alpha (%mat1 @ %mat2 ) + * + * try to convert it like below: + * %mm: Tensor = aten::matmul(%mat1, %mat2) + * %bias_new: Tensor = aten::mul(%bias, %beta) + * %out: Tensor = aten::add(%bias_new, %mm, %alpha) + **/ +bool AddmmConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for AddmmConverter"); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for AddmmConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())), + "input[1] for AddmmConverter is not Tensor as expected"); + POROS_CHECK_TRUE((inputs[2]->type()->isSubtypeOf(c10::TensorType::get())), + "input[2] for AddmmConverter is not Tensor as expected"); + + //extract bias & mat1 & mat2 + auto bias = engine->context().get_tensor(inputs[0]); + auto mat1 = engine->context().get_tensor(inputs[1]); + auto mat2 = engine->context().get_tensor(inputs[2]); + POROS_CHECK_TRUE(((bias != nullptr) && (mat1 != nullptr) && (mat2 != nullptr)), + "Unable to init input tensor for node: " << *node); + + //extract beta & alpha + auto beta = (engine->context().get_constant(inputs[3])).toScalar().to<float>(); + auto alpha = (engine->context().get_constant(inputs[4])).toScalar().to<float>(); + + /*----------------------------------------------------------------------------- + step1: %mm: Tensor = aten::matmul(%mat1, %mat2) + -------------------------------------------------------------------------------*/ + // Ensure mat1 and mat2 tensors have same nbDims by expanding the dimensions (from 0 axis) if + // necessary. + // TODO: this is much simpler than the real addmm semantics; 
we should change this someday + if (mat1->getDimensions().nbDims < mat2->getDimensions().nbDims) { + mat1 = add_padding(engine, node, mat1, mat2->getDimensions().nbDims, false, false); + } else { + mat2 = add_padding(engine, node, mat2, mat1->getDimensions().nbDims, false, false); + } + auto mm_layer = engine->network()->addMatrixMultiply( + *mat1, nvinfer1::MatrixOperation::kNONE, *mat2, nvinfer1::MatrixOperation::kNONE); + POROS_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *node); + mm_layer->setName((layer_info(node) + "_IMatrixMultiplyLayer").c_str()); + auto mm_output = mm_layer->getOutput(0); + + /*----------------------------------------------------------------------------- + step2: %bias_new: Tensor = aten::mul(%bias, %beta) + -------------------------------------------------------------------------------*/ + if (1 != beta) { + auto beta_tensor = tensor_to_const(engine, torch::tensor({beta})); + auto bias_new_layer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + bias, + beta_tensor, + layer_info(node) + "_prod_for_beta"); + POROS_CHECK(bias_new_layer, "Unable to create bias mul layer from node: " << *node); + bias = bias_new_layer->getOutput(0); + } + + /*----------------------------------------------------------------------------- + step3: %out: Tensor = aten::add(%bias_new, %mm, %alpha) + -------------------------------------------------------------------------------*/ + if (1 != alpha) { + auto alpha_tensor = tensor_to_const(engine, torch::tensor({alpha})); + auto mm_new_layer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + mm_output, + alpha_tensor, + layer_info(node) + "_prod_for_alpha"); + POROS_CHECK(mm_new_layer, "Unable to create alpha*input layer from node: " << *node); + mm_output = mm_new_layer->getOutput(0); + } + auto add_mm = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kSUM, + bias, + mm_output, + layer_info(node) + "_sum"); + POROS_CHECK(add_mm, "Unable to create add layer from node: " << *node); + engine->context().set_tensor(node->outputs()[0], add_mm->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << add_mm->getOutput(0)->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, MatmulConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, BmmConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, AddmmConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/matrix_multiply.h b/poros/src/poros/converter/gpu/matrix_multiply.h new file mode 100644 index 0000000000..4327cc89bb --- /dev/null +++ b/poros/src/poros/converter/gpu/matrix_multiply.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file matrix_multiply.h +* @author tianjinjin@baidu.com +* @date Wed Aug 18 20:30:19 CST 2021 +* @brief +**/ + +#pragma once + +#include <string> +#include <vector> + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class MatmulConverter : public GpuConverter { +public: + MatmulConverter() {} + virtual ~MatmulConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + nvinfer1::ITensor* converter(TensorrtEngine* engine, + const torch::jit::Node *node, + nvinfer1::ITensor* self, + nvinfer1::ITensor* other); + + const std::vector<std::string> schema_string() { + return {"aten::matmul(Tensor self, Tensor other) -> Tensor"}; + } + + /** + * TODO: TRY TO SUPPORT SCHEMA PATTERNS BELOW: + * aten::matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + * **/ + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::matmul}; + } +}; + +class BmmConverter : public GpuConverter { +public: + BmmConverter() {} + virtual ~BmmConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector<std::string> schema_string() { + return {"aten::bmm(Tensor self, Tensor mat2) -> Tensor"}; + } + + /** + * TODO: TRY TO SUPPORT SCHEMA PATTERNS BELOW: + * aten::bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) + * **/ + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::bmm}; + } +}; + +class AddmmConverter : public GpuConverter { +public: + AddmmConverter() {} + virtual ~AddmmConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector<std::string> schema_string() { + return {"aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor"}; + } + + /** + * TODO: TRY TO SUPPORT SCHEMA PATTERNS BELOW: + * aten::addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + * aten::addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) + * **/ + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::addmm}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/converter/gpu/meshgrid.cpp b/poros/src/poros/converter/gpu/meshgrid.cpp new file mode 100644 index 0000000000..b8a1e3b296 --- /dev/null +++ b/poros/src/poros/converter/gpu/meshgrid.cpp @@ -0,0 +1,140 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +/** +* @file meshgrid.cpp +* @author wangrui39@baidu.com +* @date Monday November 27 11:36:11 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/meshgrid.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// aten::meshgrid(Tensor[] tensors) -> Tensor[] +bool MeshgridConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for MeshgridConverter"); + + // 1: get inputs + std::vector<nvinfer1::ITensor*> tensorlist; + //tensorlist.resize(2); + POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensor list err"); + + POROS_CHECK_TRUE((tensorlist.size() == 2), + "Expected 2 elements in a tensorlist but found " + std::to_string(tensorlist.size())); + nvinfer1::ITensor* input1 = tensorlist[0]; + POROS_CHECK_TRUE((input1->getDimensions().nbDims == 1), + "Expected scalar or 1D tensor in the tensor list but got " + std::to_string(input1->getDimensions().nbDims)); + nvinfer1::ITensor* input2 = tensorlist[1]; + POROS_CHECK_TRUE((input2->getDimensions().nbDims == 1), + "Expected scalar or 1D tensor in the tensor list but got " + std::to_string(input2->getDimensions().nbDims)); + + /*std::vector<nvinfer1::ITensor*> output_tensorlist; + output_tensorlist.emplace_back(input1); + output_tensorlist.emplace_back(input2);*/ + + // 2: build the reshape dims for the outputs + nvinfer1::Dims reshape_dim; + reshape_dim.nbDims = 2; + reshape_dim.d[0] = 1; + reshape_dim.d[1] = input1->getDimensions().d[0]; + std::vector<nvinfer1::ITensor*> output_tensorlist; + output_tensorlist.resize(2); + + // 3: build return tensorlist[0]: unsqueeze + cat + transpose + // a: unsqueeze + auto unsqueeze_shuffle_layer1 = engine->network()->addShuffle(*input1); + POROS_CHECK(unsqueeze_shuffle_layer1, "Unable to create shuffle layer from node: " << *node); + unsqueeze_shuffle_layer1->setReshapeDimensions(reshape_dim); + unsqueeze_shuffle_layer1->setName((layer_info(node) + "_IShuffleLayer_for_input1").c_str()); + nvinfer1::ITensor *un_sl_output1 = unsqueeze_shuffle_layer1->getOutput(0); + + // b: cat + std::vector<nvinfer1::ITensor*> cat_tensorlist1; + cat_tensorlist1.resize(input2->getDimensions().d[0]); + for (int i = 0; i < input2->getDimensions().d[0]; ++i) { + auto tmp_weights = Weights(at::zeros({un_sl_output1->getDimensions().d[0], un_sl_output1->getDimensions().d[1]}, {at::kCUDA}).to(torch::kInt)); + auto constant_layer = engine->network()->addConstant(tmp_weights.shape, tmp_weights.data); + nvinfer1::ITensor* constant_tensor = constant_layer->getOutput(0); + auto add_layer = engine->network()->addElementWise(*constant_tensor, *un_sl_output1, nvinfer1::ElementWiseOperation::kSUM); + add_layer->setName((layer_info(node) + "_sum_for_tensorlist1_" + std::to_string(i)).c_str()); + cat_tensorlist1[i] = add_layer->getOutput(0); + } + + auto cat_layer1 = engine->network()->addConcatenation(cat_tensorlist1.data(), cat_tensorlist1.size()); + cat_layer1->setAxis(0); + cat_layer1->setName((layer_info(node) + "_IConcatenationLayer_1").c_str()); + nvinfer1::ITensor *cat_output1 = cat_layer1->getOutput(0); + + // c: transpose + auto transpose_shuffle_layer = engine->network()->addShuffle(*cat_output1); + POROS_CHECK(transpose_shuffle_layer, "Unable to create shuffle layer from node: " << *node); + 
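// the cat above stacked len(input2) copies of input1 as rows; the transpose below yields the (len(input1) x len(input2)) grid expected for output[0] +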
nvinfer1::Permutation permute; + permute.order[0] = 1; + permute.order[1] = 0; + transpose_shuffle_layer->setSecondTranspose(permute); + transpose_shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_cat_output").c_str()); + nvinfer1::ITensor *ts_output = transpose_shuffle_layer->getOutput(0); + output_tensorlist[0] = ts_output; + + // 4: build return tensorlist[1]: unsqueeze + cat + // a: unsqueeze + reshape_dim.d[1] = input2->getDimensions().d[0]; + auto unsqueeze_shuffle_layer2 = engine->network()->addShuffle(*input2); + POROS_CHECK(unsqueeze_shuffle_layer2, "Unable to create shuffle layer from node: " << *node); + unsqueeze_shuffle_layer2->setReshapeDimensions(reshape_dim); + unsqueeze_shuffle_layer2->setName((layer_info(node) + "_IShuffleLayer_for_input2").c_str()); + nvinfer1::ITensor *un_sl_output2 = unsqueeze_shuffle_layer2->getOutput(0); + + // b: cat + std::vector<nvinfer1::ITensor*> cat_tensorlist2; + cat_tensorlist2.resize(input1->getDimensions().d[0]); + for (int i = 0; i < input1->getDimensions().d[0]; ++i) { + auto tmp_weights = Weights(at::zeros({un_sl_output2->getDimensions().d[0], un_sl_output2->getDimensions().d[1]}, {at::kCUDA}).to(torch::kInt)); + auto constant_layer = engine->network()->addConstant(tmp_weights.shape, tmp_weights.data); + nvinfer1::ITensor* constant_tensor = constant_layer->getOutput(0); + auto add_layer = engine->network()->addElementWise(*constant_tensor, *un_sl_output2, nvinfer1::ElementWiseOperation::kSUM); + //cat_tensorlist2.emplace_back(add_layer->getOutput(0)); + add_layer->setName((layer_info(node) + "_sum_for_tensorlist2_" + std::to_string(i)).c_str()); + cat_tensorlist2[i] = add_layer->getOutput(0); + } + + auto cat_layer2 = engine->network()->addConcatenation(cat_tensorlist2.data(), cat_tensorlist2.size()); + cat_layer2->setAxis(0); + cat_layer2->setName((layer_info(node) + "_IConcatenationLayer_2").c_str()); + nvinfer1::ITensor *cat_output2 = cat_layer2->getOutput(0); + output_tensorlist[1] = cat_output2; + + // 5: set outputs + engine->context().set_tensorlist(node->outputs()[0], output_tensorlist); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, MeshgridConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/meshgrid.h b/poros/src/poros/converter/gpu/meshgrid.h new file mode 100644 index 0000000000..b29d6c4e5e --- /dev/null +++ b/poros/src/poros/converter/gpu/meshgrid.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file meshgrid.h +* @author wangrui39@baidu.com +* @date Monday November 27 11:36:11 CST 2021 +* @brief +**/ + +#pragma once + +#include <string> +#include <vector> + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +// DEPRECATED: use `lowering/fuse_meshgrid.h` to rewrite this op. This converter is no longer needed. 
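+// Example: for 1-D inputs x of length n and y of length m, torch.meshgrid([x, y]) returns [grid_x, grid_y], both of shape (n x m), with grid_x[i][j] = x[i] and grid_y[i][j] = y[j].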
+// Corresponds to torch.meshgrid https://pytorch.org/docs/1.9.0/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid +class MeshgridConverter : public GpuConverter { +public: + MeshgridConverter() {} + virtual ~MeshgridConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector<std::string> schema_string() { + return {"aten::meshgrid(Tensor[] tensors) -> Tensor[]"}; + } + + const std::vector<torch::jit::NodeKind> node_kind() { + return {torch::jit::aten::meshgrid}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::meshgrid(Tensor[] tensors) -> Tensor[]", {0, 0}}}); + } +}; + + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/mul_div.cpp b/poros/src/poros/converter/gpu/mul_div.cpp new file mode 100644 index 0000000000..51aa0ca224 --- /dev/null +++ b/poros/src/poros/converter/gpu/mul_div.cpp @@ -0,0 +1,297 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file mul_div.cpp +* @author tianjinjin@baidu.com +* @date Mon Mar 8 11:36:11 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/mul_div.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/* +aten::mul.Tensor(Tensor self, Tensor other) -> Tensor +aten::mul.Scalar(Tensor self, Scalar other) -> Tensor*/ +bool MulConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for MulConverter"); + + // first check whether the schema is aten::mul.int(int a, int b) -> (int) + if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[4]).operator_name()) { + // check whether the ints are fed as nvtensors + if (check_inputs_tensor_scalar(engine, node)) { + // fetch the nvtensors that hold the ints + nvinfer1::ITensor *a = this->get_tensor_scalar(inputs[0]); + nvinfer1::ITensor *b = this->get_tensor_scalar(inputs[1]); + // they may be null when get_constant fails; + // return false in that case so the subgraph falls back + POROS_CHECK_TRUE((a != nullptr && b != nullptr), + node_info(node) + std::string("get int nvtensor false.")); + // multiply a and b and return the result + nvinfer1::ILayer *mul_layer = add_elementwise(engine, + nvinfer1::ElementWiseOperation::kPROD, + a, b, layer_info(node) + "_prod"); + POROS_CHECK(mul_layer, "Unable to create mul layer from node: " << *node); + nvinfer1::ITensor *output = mul_layer->getOutput(0); + engine->context().set_tensor(node->outputs()[0], output); + LOG(INFO) << "Output tensor shape: " << output->getDimensions(); + return true; + } else { + int a = engine->context().get_constant(inputs[0]).toScalar().to<int>(); + int b = engine->context().get_constant(inputs[1]).toScalar().to<int>(); + 
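// both operands are compile-time ints here, so fold the product into a graph constant +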
engine->context().set_constant(node->outputs()[0], a * b); + return true; + } + } + + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for MulConverter is not Tensor as expected"); + + // Should implement self * other + auto self = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node); + auto other = engine->context().get_tensor(inputs[1]); + // when other input is Scalar + if (other == nullptr) { + auto other_const = engine->context().get_constant(inputs[1]); + if (other_const.isScalar()) { + auto other_scalar = other_const.toScalar().to<float>(); + other = tensor_to_const(engine, torch::tensor({other_scalar})); + } else { + POROS_THROW_ERROR("Unable to get input other value for MulConverter"); + } + } + // we have seen aten::mul(float tensor, int scalar) inputs; casting everything to float works + if (self->getType() == nvinfer1::DataType::kFLOAT && other->getType() == nvinfer1::DataType::kINT32) { + nvinfer1::IIdentityLayer* identity_layer = engine->network()->addIdentity(*other); + identity_layer->setOutputType(0, nvinfer1::DataType::kFLOAT); + identity_layer->setName((layer_info(node) + "_IIdentityLayer_for_other").c_str()); + other = identity_layer->getOutput(0); + } else if (other->getType() == nvinfer1::DataType::kFLOAT && self->getType() == nvinfer1::DataType::kINT32) { + nvinfer1::IIdentityLayer* identity_layer = engine->network()->addIdentity(*self); + identity_layer->setOutputType(0, nvinfer1::DataType::kFLOAT); + identity_layer->setName((layer_info(node) + "_IIdentityLayer_for_self").c_str()); + self = identity_layer->getOutput(0); + } + + auto mul = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPROD, self, other, layer_info(node) + "_prod"); + POROS_CHECK(mul, "Unable to create mul layer from node: " << *node); + engine->context().set_tensor(node->outputs()[0], mul->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << mul->getOutput(0)->getDimensions(); + return true; +} + +/* +aten::div.Tensor(Tensor self, Tensor other) -> Tensor +aten::div.Scalar(Tensor self, Scalar other) -> Tensor*/ +bool DivConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) { + at::ArrayRef<const torch::jit::Value*> inputs = node->inputs(); + POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for DivConverter"); + + // aten::div.int(int a, int b) -> (float) + if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[4]).operator_name() || \ + node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[5]).operator_name()) { + if (check_inputs_tensor_scalar(engine, node)) { + nvinfer1::ITensor *a = this->get_tensor_scalar(inputs[0]); + nvinfer1::ITensor *b = this->get_tensor_scalar(inputs[1]); + POROS_CHECK_TRUE((a != nullptr && b != nullptr), + node_info(node) + std::string("get int nvtensor false.")); + // Set datatype for input tensor to kFLOAT + auto identity_layer1 = engine->network()->addIdentity(*a); + identity_layer1->setOutputType(0, nvinfer1::DataType::kFLOAT); + identity_layer1->setName((layer_info(node) + "_IIdentityLayer_for_input0").c_str()); + nvinfer1::ITensor *a_float = identity_layer1->getOutput(0); + // Set datatype for input tensor to kFLOAT + auto identity_layer2 = engine->network()->addIdentity(*b); + identity_layer2->setOutputType(0, nvinfer1::DataType::kFLOAT); + identity_layer2->setName((layer_info(node) + "_IIdentityLayer_for_input1").c_str()); + nvinfer1::ITensor *b_float = identity_layer2->getOutput(0); + + nvinfer1::ILayer *div_layer = 
+
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+            "input[0] for DivConverter is not Tensor as expected");
+
+    // Should implement self / other
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto other = engine->context().get_tensor(inputs[1]);
+    // when the other input is a Scalar
+    if (other == nullptr) {
+        auto other_const = engine->context().get_constant(inputs[1]);
+        if (other_const.isScalar()) {
+            auto other_scalar = other_const.toScalar().to<float>();
+            other = tensor_to_const(engine, torch::tensor({other_scalar}));
+        } else {
+            POROS_THROW_ERROR("Unable to get input other value for DivConverter");
+        }
+    }
+
+    if (self->getType() == nvinfer1::DataType::kINT32) {
+        nvinfer1::IIdentityLayer* identity_self_layer = engine->network()->addIdentity(*self);
+        identity_self_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        self = identity_self_layer->getOutput(0);
+    }
+
+    if (other->getType() == nvinfer1::DataType::kINT32) {
+        nvinfer1::IIdentityLayer* identity_other_layer = engine->network()->addIdentity(*other);
+        identity_other_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        other = identity_other_layer->getOutput(0);
+    }
+
+    auto div = add_elementwise(engine, nvinfer1::ElementWiseOperation::kDIV, self, other, layer_info(node) + "_div");
+    POROS_CHECK(div, "Unable to create div layer from node: " << *node);
+    engine->context().set_tensor(node->outputs()[0], div->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << div->getOutput(0)->getDimensions();
+    return true;
+}
+
+// aten::floordiv.int(int a, int b) -> (int)
+// aten::__round_to_zero_floordiv(int a, int b) -> (int)
+bool FloordivConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for ScalarFloordivConverter");
+
+    // inputs arrive as nvtensors
+    if (check_inputs_tensor_scalar(engine, node)) {
+        // __round_to_zero_div support here is still pending
+        nvinfer1::ITensor *a = this->get_tensor_scalar(inputs[0]);
+        nvinfer1::ITensor *b = this->get_tensor_scalar(inputs[1]);
+
+        POROS_CHECK_TRUE((a != nullptr && b != nullptr),
+                node_info(node) + std::string("get int nvtensor false."));
+
+        nvinfer1::ElementWiseOperation operation;
+        std::string nv_layer_name;
+        if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[1]).operator_name()) {
+            operation = nvinfer1::ElementWiseOperation::kDIV;
+            nv_layer_name = layer_info(node) + "_div";
+        } else {
+            operation = nvinfer1::ElementWiseOperation::kFLOOR_DIV;
+            nv_layer_name = layer_info(node) + "_floor_div";
+        }
+
+        nvinfer1::ILayer *floordiv_layer = add_elementwise(engine,
+                operation,
+                a, b, nv_layer_name);
+        POROS_CHECK(floordiv_layer, "Unable to create floordiv layer from node: " << *node);
+        nvinfer1::ITensor *output = floordiv_layer->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    } else {
+        // inputs arrive as int ivalues
+        int a = engine->context().get_constant(inputs[0]).toScalar().to<int>();
+        int b = engine->context().get_constant(inputs[1]).toScalar().to<int>();
+        POROS_CHECK_TRUE((b != 0), "invalid inputs[1] for ScalarFloordivConverter, which is equal to zero");
+        int output = 0;
+        if (node->kind() == node_kind()[0]) {
+            output = std::floor(float(a) / float(b));
+        } else {
+            output = int(a / b);
+        }
+        engine->context().set_constant(node->outputs()[0], output);
+    }
+    return true;
+}
+
+//aten::remainder.Scalar(Tensor self, Scalar other) -> (Tensor)
+//aten::remainder.Tensor(Tensor self, Tensor other) -> (Tensor)
+bool RemainderConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for RemainderConverter");
+
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+            "input[0] for RemainderConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->type()->kind() == c10::TypeKind::FloatType ||
+            inputs[1]->type()->kind() == c10::TypeKind::IntType ||
+            inputs[1]->type()->kind() == c10::TypeKind::TensorType),
+            "input[1] for RemainderConverter is not Scalar as expected");
+
+    nvinfer1::ITensor *self = engine->context().get_tensor(inputs[0]);
+
+    nvinfer1::ITensor *other;
+
+    if (inputs[1]->type()->kind() == c10::TypeKind::TensorType) {
+        other = engine->context().get_tensor(inputs[1]);
+    } else {
+        other = tensor_to_const(engine,
+                torch::tensor(engine->context().get_constant(inputs[1]).toDouble(), torch::kFloat32));
+    }
+
+    POROS_CHECK_TRUE((self != nullptr && other != nullptr),
+            node_info(node) + std::string("get int nvtensor false."));
+
+    // floor_div
+    nvinfer1::ILayer *floordiv_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kFLOOR_DIV,
+            self, other, layer_info(node) + "_floor_div");
+    POROS_CHECK(floordiv_layer, "Unable to create floordiv layer from node: " << *node);
+    nvinfer1::ITensor *floordiv_output = floordiv_layer->getOutput(0);
+
+    // prod
+    nvinfer1::ILayer *prod_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kPROD,
+            floordiv_output, other, layer_info(node) + "_prod");
+    POROS_CHECK(prod_layer, "Unable to create prod layer from node: " << *node);
+    nvinfer1::ITensor *prod_output = prod_layer->getOutput(0);
+
+    // sub
+    nvinfer1::ILayer *sub_layer = add_elementwise(engine,
+            nvinfer1::ElementWiseOperation::kSUB,
+            self, prod_output, layer_info(node) + "_sub");
+    POROS_CHECK(sub_layer, "Unable to create sub layer from node: " << *node);
+    nvinfer1::ITensor *output = sub_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+
+    return true;
+}
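+
+// The three layers above implement the identity
+//   remainder(self, other) = self - floor(self / other) * other
+// which matches Python's % semantics (the result takes the sign of other), e.g.
+//   remainder(-3.0, 2.0) = -3.0 - floor(-1.5) * 2.0 = -3.0 - (-4.0) = 1.0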
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, MulConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, DivConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, FloordivConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, RemainderConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/mul_div.h b/poros/src/poros/converter/gpu/mul_div.h
new file mode 100644
index 0000000000..20da0a20e7
--- /dev/null
+++ b/poros/src/poros/converter/gpu/mul_div.h
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file mul_div.h
+* @author tianjinjin@baidu.com
+* @date Mon Aug 16 12:26:28 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class MulConverter : public GpuConverter {
+public:
+    MulConverter() {}
+    virtual ~MulConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::mul.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::mul.Scalar(Tensor self, Scalar other) -> Tensor",
+                "aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)",
+                "aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)",
+                "aten::mul.int(int a, int b) -> (int)",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::mul,
+                torch::jit::aten::mul_};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::mul.int(int a, int b) -> (int)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::mul.Scalar(Tensor self, Scalar other) -> Tensor", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", {1, 1}}});
+        return result;
+    }
+};
+
+class DivConverter : public GpuConverter {
+public:
+    DivConverter() {}
+    virtual ~DivConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::div.Tensor(Tensor self, Tensor other) -> Tensor",
+                "aten::div.Scalar(Tensor self, Scalar other) -> (Tensor)",
+                "aten::div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)",
+                "aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)",
+                "aten::div.int(int a, int b) -> (float)",
+                "aten::div(Scalar a, Scalar b) -> (float)"
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor",
+     * "aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor"
+     * "aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::div,
+                torch::jit::aten::div_};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::div.int(int a, int b) -> (float)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::div(Scalar a, Scalar b) -> (float)", {1, 1}}});
+        return result;
+    }
+
+};
+
+class FloordivConverter : public GpuConverter {
+public:
+    FloordivConverter() {}
+    virtual ~FloordivConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::floordiv.int(int a, int b) -> (int)",
+                "aten::__round_to_zero_floordiv.int(int a, int b) -> (int)"
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::floordiv,
+                torch::jit::aten::__round_to_zero_floordiv};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::floordiv.int(int a, int b) -> (int)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::__round_to_zero_floordiv.int(int a, int b) -> (int)", {1, 1}}});
+        return result;
+    }
+};
+
+
+class RemainderConverter : public GpuConverter {
+public:
+    RemainderConverter() {}
+    virtual ~RemainderConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::remainder.Scalar(Tensor self, Scalar other) -> (Tensor)",
+                "aten::remainder.Tensor(Tensor self, Tensor other) -> (Tensor)",
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::remainder};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::remainder.Scalar(Tensor self, Scalar other) -> (Tensor)", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::remainder.Tensor(Tensor self, Tensor other) -> (Tensor)", {1, 1}}});
+        return result;
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/non_converterable.cpp b/poros/src/poros/converter/gpu/non_converterable.cpp
new file mode 100644
index 0000000000..1212799d4c
--- /dev/null
+++ b/poros/src/poros/converter/gpu/non_converterable.cpp
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file non_converterable.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Aug 26 10:24:14 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/non_converterable.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)*/
+bool ContiguousConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::TensorType::get()),
+            "input[0] for ContiguousConverter is not Tensor as expected");
+
+    //extract tensors
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    //need to do nothing, update the map directly.
+    engine->context().set_tensor(node->outputs()[0], in_tensor);
+    LOG(INFO) << "Output tensor shape: " << in_tensor->getDimensions();
+    return true;
+}
+
+/*aten::dropout(Tensor input, float p, bool train) -> Tensor*/
+bool DropoutConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for DropoutConverter");
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::TensorType::get()),
+            "input[0] for DropoutConverter is not Tensor as expected");
+
+    //extract tensors
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    //need to do nothing, update the map directly.
+    engine->context().set_tensor(node->outputs()[0], in_tensor);
+    LOG(INFO) << "Output tensor shape: " << in_tensor->getDimensions();
+    return true;
+}
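+
+// Dropout (and its feature_* variants) is an identity mapping at inference
+// time, and TensorRT engines only ever run the forward/eval path, so
+// forwarding the input nvtensor is sufficient; e.g.
+// torch.nn.functional.dropout(x, p=0.5, training=False) returns x unchanged.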
+
+// aten::IntImplicit(Tensor a) -> (int)
+bool IntimplicitConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::TensorType::get()),
+            "input[0] for IntimplicitConverter is not Tensor as expected");
+
+    //extract tensors
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    //need to do nothing, update the map directly.
+    engine->context().set_tensor(node->outputs()[0], in_tensor);
+    LOG(INFO) << "Output tensor shape: " << in_tensor->getDimensions();
+    return true;
+}
+
+// prim::tolist(Tensor a) -> (int[])
+bool TolistConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::TensorType::get()),
+            "input[0] for TolistConverter is not Tensor as expected");
+
+    //extract tensors
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    //need to do nothing, update the map directly.
+    engine->context().set_tensor(node->outputs()[0], in_tensor);
+    LOG(INFO) << "Output tensor shape: " << in_tensor->getDimensions();
+    return true;
+}
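+
+// Both aten::IntImplicit (inserted by TorchScript when a 0-dim tensor is used
+// where an int is expected, e.g. `n: int = int(t)`) and prim::tolist
+// (`t.tolist()`) only reinterpret an existing tensor, so these converters
+// likewise just alias the input nvtensor to the output value.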
+
+// aten::detach(Tensor(a) self) -> Tensor(a)
+bool DetachConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::TensorType::get()),
+            "input[0] for DetachConverter is not Tensor as expected");
+
+    //extract tensors
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    //need to do nothing, update the map directly.
+    engine->context().set_tensor(node->outputs()[0], in_tensor);
+    LOG(INFO) << "Output tensor shape: " << in_tensor->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ContiguousConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, DropoutConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, IntimplicitConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, TolistConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, DetachConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/non_converterable.h b/poros/src/poros/converter/gpu/non_converterable.h
new file mode 100644
index 0000000000..7c82fd6700
--- /dev/null
+++ b/poros/src/poros/converter/gpu/non_converterable.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file non_converterable.h
+* @author tianjinjin@baidu.com
+* @date Thu Aug 26 10:24:14 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ContiguousConverter : public GpuConverter {
+public:
+    ContiguousConverter() {}
+    virtual ~ContiguousConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::contiguous};
+    }
+};
+
+class DropoutConverter : public GpuConverter {
+public:
+    DropoutConverter() {}
+    virtual ~DropoutConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::dropout(Tensor input, float p, bool train) -> Tensor",
+                "aten::dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)",
+                "aten::feature_dropout(Tensor input, float p, bool train) -> Tensor",
+                "aten::feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)",
+     * "aten::feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)",
+     *
+     * some strange error message like: feature_alpha_dropout_ is not a member of 'torch::jit::aten'
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::dropout,
+                torch::jit::aten::dropout_,
+                torch::jit::aten::feature_dropout,
+                torch::jit::aten::feature_alpha_dropout,};
+    }
+
+};
+
+// aten::IntImplicit(Tensor a) -> (int)
+class IntimplicitConverter : public GpuConverter {
+public:
+    IntimplicitConverter() {}
+    virtual ~IntimplicitConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::IntImplicit(Tensor a) -> (int)"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::IntImplicit};
+    }
+
+};
+
+// prim::tolist
+class TolistConverter : public GpuConverter {
+public:
+    TolistConverter() {}
+    virtual ~TolistConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    //prim::tolist kind nodes have no schema
+    const std::vector<std::string> schema_string() {
+        return {};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::tolist};
+    }
+
+};
+
+// aten::detach
+class DetachConverter : public GpuConverter {
+public:
+    DetachConverter() {}
+    virtual ~DetachConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::detach(Tensor(a) self) -> Tensor(a)"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::detach};
+    }
+
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/norm.cpp b/poros/src/poros/converter/gpu/norm.cpp
new file mode 100644
index 0000000000..5d47b1a1af
--- /dev/null
+++ b/poros/src/poros/converter/gpu/norm.cpp
@@ -0,0 +1,183 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file norm.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-02-23 20:33:41
+* @brief
+**/
+
+#include <cmath>
+
+#include "poros/converter/gpu/norm.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool NormConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    // inputs.size() == 4
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for NormConverter")
+
+    // inputs[0] => self
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+            "input[0] for NormConverter is not Tensor as expected");
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node)
+    auto self_dims = self->getDimensions();
+
+    // inputs[1] => p
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+            "Non-constant p is not supported for NormConverter");
+    auto p_const = engine->context().get_constant(inputs[1]);
+    POROS_CHECK_TRUE((p_const.isScalar()), "Non-scalar p is not supported for NormConverter")
+    nvinfer1::ITensor *p, *p_inverse;
+    auto p_scalar = p_const.toScalar().to<float>();
+    p = tensor_to_const(engine, torch::tensor(p_scalar));
+    p_inverse = tensor_to_const(engine, torch::tensor(1.0 / p_scalar));
+
+    // inputs[2] => dims
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+            "Non-constant dims is not supported for NormConverter");
+    auto dims_const = engine->context().get_constant(inputs[2]);
+    POROS_CHECK_TRUE((dims_const.isIntList()), "dims type must be int[] for NormConverter")
+    auto dims_list = dims_const.toIntList().vec();
+    uint32_t dim = 0;
+    for (auto d : dims_list) {
+        if (d < 0) {
+            d = self_dims.nbDims + d;
+        }
+        dim |= 1 << d;
+    }
+    // in case dims_list is empty, reduce over all axes
+    if (dim == 0) {
+        dim = pow(2, self_dims.nbDims) - 1;
+    }
+
+    // inputs[3] => keepdim
+    POROS_CHECK_TRUE((inputs[3]->node()->kind() == torch::jit::prim::Constant),
+            "Non-constant keepdim is not supported for NormConverter");
+    auto keepdim_const = engine->context().get_constant(inputs[3]);
+    POROS_CHECK_TRUE((keepdim_const.isBool()), "keepdim type must be bool for NormConverter")
+    auto keepdim = keepdim_const.toBool();
+
+    // unary_layer
+    auto unary_layer = engine->network()->addUnary(*self, nvinfer1::UnaryOperation::kABS);
+    unary_layer->setName((layer_info(node) + "_IUnaryLayer").c_str());
+    auto unary_output = unary_layer->getOutput(0);
+
+    // elementwise_layer 1
+    auto ew1_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPOW, unary_output, p,
+            layer_info(node) + "_pow_for_unary");
+
+    POROS_CHECK(ew1_layer, "Unable to create POW layer from node: " << *node);
+    auto ew_output = ew1_layer->getOutput(0);
+
+    // reduce_layer
+    auto reduce_layer = engine->network()->addReduce(*ew_output, nvinfer1::ReduceOperation::kSUM, dim, keepdim);
+    reduce_layer->setName((layer_info(node) + "_IReduceLayer").c_str());
+    auto reduce_output = reduce_layer->getOutput(0);
+
+    // elementwise_layer 2
+    auto ew2_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPOW, reduce_output, p_inverse,
+            layer_info(node) + "_pow_for_reduce");
+
+    engine->context().set_tensor(node->outputs()[0], ew2_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << ew2_layer->getOutput(0)->getDimensions();
+    return true;
+}
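+
+// The layer chain above evaluates ||x||_p = (sum over dim of |x|^p)^(1/p) as
+// kABS -> kPOW(p) -> kSUM reduce -> kPOW(1/p). TensorRT takes the reduce axes
+// as a bitmask with one bit per dimension: for a 4-D input, dim=[1] gives
+// 1 << 1 == 0b0010, and dim=[-1] is first normalized to 3, giving 0b1000; an
+// empty dim list falls back to reducing over all axes (2^nbDims - 1).
+// FrobeniusNormConverter below is the fixed p=2 instance of the same chain.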
shape: " << ew2_layer->getOutput(0)->getDimensions(); + return true; +} + +bool FrobeniusNormConverter::converter(TensorrtEngine *engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + // inputs.size() == 3 + POROS_CHECK_TRUE((inputs.size() == 3), "invaid inputs size for FrobeniusNormConverter") + + // inputs[0] => self + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for FrobeniusNormConverter is not Tensor as expected"); + auto self = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node) + auto self_dims = self->getDimensions(); + + // p + nvinfer1::ITensor *p, *p_inverse; + float p_scalar = 2; + p = tensor_to_const(engine, torch::tensor(p_scalar)); + p_inverse = tensor_to_const(engine, torch::tensor(1.0 / p_scalar)); + + // inputs[1] => dims + POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant), + "Non-constant dims is not support for FrobeniusNormConverter"); + auto dims_const = engine->context().get_constant(inputs[1]); + POROS_CHECK_TRUE((dims_const.isIntList()), " dims type must be int[] for FrobeniusNormConverter") + auto dims_list = dims_const.toIntList().vec(); + uint32_t dim = 0; + for (auto d: dims_list) { + if (d < 0) { + d = self_dims.nbDims + d; + } + dim |= 1 << d; + } + // in case of dims_list is empty or invalid, reduce on all axes + if (dim == 0) { + dim = pow(2, self_dims.nbDims) - 1; + } + + // input[2] => keepdim + POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant), + "Non-constant dims is not support for FrobeniusNormConverter"); + auto keepdim_const = engine->context().get_constant(inputs[2]); + POROS_CHECK_TRUE((keepdim_const.isBool()), " dims type must be int[] for FrobeniusNormConverter") + auto keepdim = keepdim_const.toBool(); + + // unary_layer + auto unary_layer = engine->network()->addUnary(*self, nvinfer1::UnaryOperation ::kABS); + unary_layer->setName((layer_info(node) + "_IUnaryLayer").c_str()); + auto unary_output = unary_layer->getOutput(0); + + // elementwise_layer 1 + auto ew1_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPOW, unary_output, p, + layer_info(node) + "_pow_for_unary"); + + POROS_CHECK(ew1_layer, "Unable to create POW layer from node: " << *node); + auto ew_output = ew1_layer->getOutput(0); + + // reduce_layer + auto reduce_layer = engine->network()->addReduce(*ew_output, nvinfer1::ReduceOperation::kSUM, dim, keepdim); + reduce_layer->setName((layer_info(node) + "_IReduceLayer").c_str()); + auto reduce_output = reduce_layer->getOutput(0); + + // elementwise_layer 2 + auto ew2_layer = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPOW, reduce_output, p_inverse, + layer_info(node) + "_pow_for_reduce"); + + engine->context().set_tensor(node->outputs()[0], ew2_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << ew2_layer->getOutput(0)->getDimensions(); + return true; +} + +// +POROS_REGISTER_CONVERTER(TensorrtEngine, NormConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, FrobeniusNormConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/norm.h b/poros/src/poros/converter/gpu/norm.h new file mode 100644 index 0000000000..3da51ead59 --- /dev/null +++ b/poros/src/poros/converter/gpu/norm.h @@ -0,0 +1,98 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file norm.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-02-23 20:33:45
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class NormConverter : public GpuConverter {
+public:
+    NormConverter() {}
+
+    virtual ~NormConverter() {}
+
+    bool converter(TensorrtEngine *engine, const torch::jit::Node *node);
+
+    //aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
+    const std::vector<std::string> schema_string() {
+        return {
+                "aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::norm};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor", {1, 1}}});
+        return result;
+    }
+
+};
+
+class FrobeniusNormConverter : public GpuConverter {
+public:
+    FrobeniusNormConverter() {}
+
+    virtual ~FrobeniusNormConverter() {}
+
+    bool converter(TensorrtEngine *engine, const torch::jit::Node *node);
+
+    //aten::frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+    const std::vector<std::string> schema_string() {
+        return {
+                "aten::frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     *
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::frobenius_norm};
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor", {1, 1}}});
+        return result;
+    }
+
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/plugins/interpolate_plugin.cpp b/poros/src/poros/converter/gpu/plugins/interpolate_plugin.cpp
new file mode 100644
index 0000000000..ef20680593
--- /dev/null
+++ b/poros/src/poros/converter/gpu/plugins/interpolate_plugin.cpp
@@ -0,0 +1,398 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file interpolate_plugin.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Sep 27 16:18:38 CST 2021
+* @brief
+**/
+
+#include "torch/torch.h"
+
+#include "poros/converter/gpu/plugins/interpolate_plugin.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/macros.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+InterpolatePlugin::InterpolatePlugin(std::vector<int64_t> in_shape,
+        std::vector<int64_t> out_shape,
+        std::vector<int64_t> size,
+        std::vector<double> scales,
+        std::string mode,
+        bool align_corners,
+        bool use_scales)
+    : in_shape_(in_shape), out_shape_(out_shape), size_(size),
+      scales_(scales), mode_(mode), align_corners_(align_corners), use_scales_(use_scales) {
+    if (use_scales) {
+        POROS_ASSERT(mode_ != "adaptive_avg_pool2d",
+                "use_scales is not valid for adaptive_avg_pool2d");
+        POROS_ASSERT(scales_.size() != 0,
+                "Attempted to use interpolate plugin without providing scales while use_scales=true");
+        at::Tensor input = at::randint(1, 10, in_shape, {at::kCUDA});
+        at::Tensor output;
+
+        if (mode_ == "linear") {
+            output = at::upsample_linear1d(input, c10::nullopt, align_corners_, scales_[0]);
+        } else if (mode_ == "bilinear") {
+            output = at::upsample_bilinear2d(input, c10::nullopt, align_corners_, scales_);
+            std::cout << output.sizes() << std::endl;
+        } else if (mode_ == "trilinear") {
+            output = at::upsample_trilinear3d(input, c10::nullopt, align_corners_, scales_);
+        }
+
+        out_shape_ = output.sizes().vec();
+    } else {
+        POROS_ASSERT((size_.size() != 0 && out_shape_.size() != 0),
+                "Attempted to use interpolate plugin without providing output size while use_scales=false");
+    }
+}
+
+InterpolatePlugin::InterpolatePlugin(const char* data, size_t length) {
+    std::istringstream data_stream(std::string(data, length));
+
+    torch::serialize::InputArchive input_archive;
+    input_archive.load_from(data_stream);
+    {
+        torch::IValue value;
+        input_archive.read("in_shape", value);
+        in_shape_ = value.toIntVector();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("out_shape", value);
+        out_shape_ = value.toIntVector();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("size", value);
+        size_ = value.toIntVector();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("scales", value);
+        scales_ = value.toDoubleVector();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("mode", value);
+        mode_ = value.toStringRef();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("align_corners", value);
+        align_corners_ = value.toBool();
+    }
+    {
+        torch::IValue value;
+        input_archive.read("use_scales", value);
+        use_scales_ = value.toBool();
+    }
+}
+
+std::vector<int64_t> InterpolatePlugin::getInputShape() {
+    return in_shape_;
+}
+
+std::vector<int64_t> InterpolatePlugin::getOutputShape() {
+    return out_shape_;
+}
+
+std::vector<int64_t> InterpolatePlugin::getOutputSize() {
+    return size_;
+}
+
+int InterpolatePlugin::getNbOutputs() const noexcept {
+    if (mode_ == "adaptive_max_pool2d") {
+        return 2;
+    } else {
+        return 1;
+    }
+}
+
+const char* InterpolatePlugin::getPluginType() const noexcept {
+    return "Interpolate";
+}
+
+const char* InterpolatePlugin::getPluginVersion() const noexcept {
+    return "1";
+}
+
+const char* InterpolatePlugin::getPluginNamespace() const noexcept {
+    return "";
+}
+
+nvinfer1::IPluginV2DynamicExt* InterpolatePlugin::clone() const noexcept {
+    return new InterpolatePlugin(in_shape_, out_shape_, size_, scales_, mode_, align_corners_, use_scales_);
+}
+
+nvinfer1::DimsExprs InterpolatePlugin::getOutputDimensions(int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) noexcept {
+    nvinfer1::DimsExprs output(inputs[0]);
+
+    // TODO: This should enable the case of using this plugin with dynamic shape, scale factor and align corners == true
+    // to cover the different implementations between PyTorch and TRT. However TRT currently does not support doubles for
+    // ExprBuilder constants. Once that is possible enable this code and remove the code in the constructor
+    // if (use_scales_) {
+    //   auto input_dimsexprs = inputs[0];
+    //   output.d[0] = exprBuilder.operation(DimensionOperation::kMAX, *input_dimsexprs.d[0], *exprBuilder.constant(0));
+    //   if (mode_ == "linear") {
+    //     output.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[1],
+    //         *exprBuilder.constant(scales_[1]));
+    //   } else if (mode_ == "bilinear") {
+    //     output.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[1],
+    //         *exprBuilder.constant(scales_[1]));
+    //     output.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[2],
+    //         *exprBuilder.constant(scales_[2]));
+    //   } else if (mode_ == "trilinear") {
+    //     output.d[1] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[1],
+    //         *exprBuilder.constant(scales_[1]));
+    //     output.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[2],
+    //         *exprBuilder.constant(scales_[2]));
+    //     output.d[3] = exprBuilder.operation(DimensionOperation::kPROD, *input_dimsexprs.d[3],
+    //         *exprBuilder.constant(scales_[3]));
+    //   }
+    // } else {
+    for (unsigned int i = 0; i < out_shape_.size(); i++) {
+        output.d[i] = exprBuilder.constant(out_shape_[i]);
+    }
+    //}
+    return output;
+}
+
+nvinfer1::DataType InterpolatePlugin::getOutputDataType(int index,
+        const nvinfer1::DataType* inputTypes,
+        int nbInputs) const noexcept {
+    return nvinfer1::DataType::kFLOAT;
+}
+
+int InterpolatePlugin::initialize() noexcept {
+    return 0;
+}
+
+void InterpolatePlugin::serialize(void* buffer) const noexcept {
+    std::string data = serializeToString();
+    size_t size = getSerializationSize();
+    data.copy((char*)buffer, size);
+}
+
+std::string InterpolatePlugin::serializeToString() const {
+    torch::serialize::OutputArchive output_archive;
+
+    output_archive.write("in_shape", torch::IValue(in_shape_));
+    output_archive.write("out_shape", torch::IValue(out_shape_));
+    output_archive.write("size", torch::IValue(size_));
+    output_archive.write("scales", torch::IValue(scales_));
+    output_archive.write("mode", torch::IValue(mode_));
+    output_archive.write("align_corners", torch::IValue(align_corners_));
+    output_archive.write("use_scales", torch::IValue(use_scales_));
+
+    std::ostringstream data_str;
+    output_archive.save_to(data_str);
+
+    return data_str.str();
+}
+
+size_t InterpolatePlugin::getSerializationSize() const noexcept {
+    return serializeToString().size();
+}
+
+bool InterpolatePlugin::supportsFormatCombination(int pos,
+        const nvinfer1::PluginTensorDesc* inOut,
+        int nbInputs,
+        int nbOutputs) noexcept {
+    if (nbInputs != 1) {
+        LOG(WARNING) << "Expected a single tensor as input to interpolate plugin";
+    }
+    if (mode_ == "adaptive_max_pool2d") {
+        if (nbOutputs != 2) {
+            LOG(WARNING) << "Expected 2 tensors as output to interpolate plugin";
+        }
+        if (pos < 0 || pos > 2) {
+            LOG(WARNING) << "There should be exactly 3 connections to the plugin - 1 input, 2 output";
+        }
+    } else {
+        if (nbOutputs != 1) {
+            LOG(WARNING) << "Expected a single tensor as output to interpolate plugin";
+        }
+        if (pos < 0 || pos > 1) {
+            LOG(WARNING) << "There should be exactly 2 connections to the plugin - 1 input, 1 output";
+        }
+    }
+
+    const nvinfer1::PluginTensorDesc& in = inOut[0];
+
+    if (pos == 0) {
+        return (in.type == nvinfer1::DataType::kFLOAT) && (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
+
+    // pos == 1, accessing information about output tensor
+    const nvinfer1::PluginTensorDesc& out = inOut[1];
+
+    return (in.type == out.type) && (in.format == out.format);
+}
+
+void InterpolatePlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+        int nbInputs,
+        const nvinfer1::DynamicPluginTensorDesc* out,
+        int nbOutputs) noexcept {
+    dtype_ = nvinfer1::DataType::kFLOAT;
+}
+
+size_t InterpolatePlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+        int nbInputs,
+        const nvinfer1::PluginTensorDesc* outputs,
+        int nbOutputs) const noexcept {
+    return 0;
+}
+
+int InterpolatePlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+        const nvinfer1::PluginTensorDesc* outputDesc,
+        const void* const* inputs,
+        void* const* outputs,
+        void* workspace,
+        cudaStream_t stream) noexcept {
+    at::Tensor input =
+        at::from_blob((void*)inputs[0], nvdim_to_sizes(inputDesc->dims), [](void*) {}, {at::kCUDA}).to(torch::kFloat);
+    at::Tensor output =
+        at::from_blob(outputs[0], nvdim_to_sizes(outputDesc->dims), [](void*) {}, {at::kCUDA}).to(torch::kFloat);
+
+    at::cuda::CUDAStream torch_stream = at::cuda::getStreamFromPool();
+    at::cuda::CUDAStreamGuard torch_guard(torch_stream);
+
+    cudaEvent_t event;
+    cudaEventCreate(&event);
+    cudaEventRecord(event, stream);
+
+    cudaStreamWaitEvent(torch_stream.stream(), event, 0);
+    at::Tensor out;
+    if (use_scales_) {
+        if (mode_ == "linear") {
+            out = at::upsample_linear1d(input, c10::nullopt, align_corners_, {scales_[0]});
+        } else if (mode_ == "bilinear") {
+            out = at::upsample_bilinear2d(input, c10::nullopt, align_corners_, scales_);
+        } else if (mode_ == "trilinear") {
+            out = at::upsample_trilinear3d(input, c10::nullopt, align_corners_, scales_);
+        }
+    } else {
+        if (mode_ == "linear") {
+            out = at::upsample_linear1d(input, {size_[0]}, align_corners_);
+        } else if (mode_ == "bilinear") {
+            out = at::upsample_bilinear2d(input, {size_[0], size_[1]}, align_corners_);
+        } else if (mode_ == "trilinear") {
+            out = at::upsample_trilinear3d(input, {size_[0], size_[1], size_[2]}, align_corners_);
+        } else if (mode_ == "adaptive_avg_pool2d") {
+            out = at::adaptive_avg_pool2d(input, {size_[0], size_[1]});
+        } else if (mode_ == "adaptive_max_pool2d") {
+            out = std::get<0>(at::adaptive_max_pool2d(input, {size_[0], size_[1]}));
+        }
+    }
+
+    output.copy_(out);
+    cudaEvent_t torch_event;
+    cudaEventCreate(&torch_event);
+    cudaEventRecord(torch_event, torch_stream.stream());
+
+    cudaStreamWaitEvent(stream, torch_event, 0);
+
+    cudaEventDestroy(event);
+    cudaEventDestroy(torch_event);
+
+    return 0;
+}
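+
+// The event choreography above bridges two CUDA streams: TensorRT's enqueue
+// stream and the PyTorch stream the aten kernels run on. The first event makes
+// the torch stream wait until TensorRT has produced the plugin inputs; the
+// second makes the TensorRT stream wait until the aten op has written `output`.
+// This keeps the plugin asynchronous without a full cudaStreamSynchronize.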
mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners", nullptr, nvinfer1::PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(nvinfer1::PluginField("use_scales", nullptr, nvinfer1::PluginFieldType::kINT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* InterpolatePluginCreator::getPluginNamespace() const noexcept { + return ""; +} + +const char* InterpolatePluginCreator::getPluginName() const noexcept { + return "Interpolate"; +} + +const char* InterpolatePluginCreator::getPluginVersion() const noexcept { + return "1"; +} + +nvinfer1::IPluginV2* InterpolatePluginCreator::createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept { + std::vector in_shape; + std::vector out_shape; + std::vector out_size; + std::vector scales; + std::string mode; + int32_t align_corners = 0; + int32_t use_scales = 0; + + for (int i = 0; i < fc->nbFields; i++) { + std::string field_name(fc->fields[i].name); + if (field_name.compare("in_shape") == 0) { + auto in_shape_values = static_cast(fc->fields[i].data); + in_shape.assign(in_shape_values, in_shape_values + fc->fields[i].length); + } else if (field_name.compare("out_shape") == 0) { + auto out_shape_values = static_cast(fc->fields[i].data); + out_shape.assign(out_shape_values, out_shape_values + fc->fields[i].length); + } else if (field_name.compare("out_size") == 0) { + auto out_size_values = static_cast(fc->fields[i].data); + out_size.assign(out_size_values, out_size_values + fc->fields[i].length); + } else if (field_name.compare("scales") == 0) { + auto scales_values = static_cast(fc->fields[i].data); + scales.assign(scales_values, scales_values + fc->fields[i].length); + } else if (field_name.compare("mode") == 0) { + mode = *static_cast(fc->fields[i].data); + } else if (field_name.compare("align_corners") == 0) { + align_corners = *static_cast(fc->fields[i].data); + } else if (field_name.compare("use_scales") == 0) { + use_scales = *static_cast(fc->fields[i].data); + } + } + InterpolatePlugin* plugin = + new InterpolatePlugin(in_shape, out_shape, out_size, scales, mode, (bool)align_corners, (bool)use_scales); + return plugin; +} + +nvinfer1::IPluginV2* InterpolatePluginCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) noexcept { + name_ = name; + return new InterpolatePlugin((const char*)serialData, serialLength); +} + +const nvinfer1::PluginFieldCollection* InterpolatePluginCreator::getFieldNames() noexcept { + return nullptr; +} + +REGISTER_TENSORRT_PLUGIN(InterpolatePluginCreator); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/plugins/interpolate_plugin.h b/poros/src/poros/converter/gpu/plugins/interpolate_plugin.h new file mode 100644 index 0000000000..05a6898972 --- /dev/null +++ b/poros/src/poros/converter/gpu/plugins/interpolate_plugin.h @@ -0,0 +1,289 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file interpolate_plugin.h
+* @author tianjinjin@baidu.com
+* @date Mon Sep 27 16:18:38 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+//from tensorrt
+#include "NvInferPlugin.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+* @brief InterpolatePlugin implements a TensorRT plugin.
+*   Pooling-type ops supported by this plugin: adaptive_avg_pool2d & adaptive_max_pool2d.
+*   Linear-type ops supported by this plugin: linear & bilinear & trilinear.
+*   The gpu-converters of those ops invoke this plugin and register it with the
+*   gpu-engine, which is how the engine gains support for them; internally the
+*   plugin simply calls the pytorch-aten (CUDA) implementations.
+**/
+class InterpolatePlugin : public nvinfer1::IPluginV2DynamicExt {
+public:
+    /*
+     * @brief Constructor taking the full parameter set.
+     * @param [in] in_shape : shape info of the input tensors
+     * @param [in] out_shape : shape info of the output tensors
+     * @param [in] size : output spatial size (when mode = linear, bilinear or trilinear)
+     *                    target output size (when mode = adaptive_avg_pool2d or adaptive_max_pool2d)
+     * @param [in] scales : scale factors for the spatial size, effective when use_scales=true.
+     * @param [in] mode : op selector; one of adaptive_avg_pool2d, adaptive_max_pool2d, linear, bilinear, trilinear.
+     * @param [in] align_corners : default false; if true, the corner pixels of input and output are
+     *                    aligned and their values preserved. Only used when mode = linear, bilinear or trilinear.
+     * @param [in] use_scales : default false; if true, the scales argument takes effect (and must be non-empty).
+     *                    Only used when mode = linear, bilinear or trilinear.
+     **/
+    InterpolatePlugin(
+        std::vector<int64_t> in_shape,
+        std::vector<int64_t> out_shape,
+        std::vector<int64_t> size,
+        std::vector<double> scales,
+        std::string mode,
+        bool align_corners,
+        bool use_scales);
+
+    /*
+     * @brief Constructor used during deserialization.
+     * @param [in] data : the serialized data.
+     * @param [in] length : data length.
+     **/
+    InterpolatePlugin(const char* data, size_t length);
+
+    /*
+     * @brief InterpolatePlugin must not have a default constructor, so it is deleted.
+     **/
+    InterpolatePlugin() = delete;
+
+    /*****************************************************************************
+                  The IPluginV2DynamicExt API defined by TensorRT
+    ******************************************************************************/
+    /*
+     * @brief Clone this plugin object for TensorRT's builder/network/engine.
+     **/
+    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
+
+    /*
+     * @brief Return the output dimension info.
+     **/
+    nvinfer1::DimsExprs getOutputDimensions(
+        int outputIndex,
+        const nvinfer1::DimsExprs* inputs,
+        int nbInputs,
+        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
+
+    /*
+     * @brief Whether the plugin inputs/outputs support the format/dtype given by inOut[pos].format and inOut[pos].type.
+     **/
+    bool supportsFormatCombination(
+        int pos,
+        const nvinfer1::PluginTensorDesc* inOut,
+        int nbInputs,
+        int nbOutputs) noexcept override;
+
+    /*
+     * @brief Configure this plugin: check that input/output counts and types are correct, etc.
+     **/
+    void configurePlugin(
+        const nvinfer1::DynamicPluginTensorDesc* in,
+        int nbInputs,
+        const nvinfer1::DynamicPluginTensorDesc* out,
+        int nbOutputs) noexcept override;
+    /*
+     * @brief Return the actual byte size of the intermediate GPU workspace needed.
+     **/
+    size_t getWorkspaceSize(
+        const nvinfer1::PluginTensorDesc* inputs,
+        int nbInputs,
+        const nvinfer1::PluginTensorDesc* outputs,
+        int nbOutputs) const noexcept override;
+    /*
+     * @brief The actual execution function of this plugin (important!).
+     **/
+    int enqueue(
+        const nvinfer1::PluginTensorDesc* inputDesc,
+        const nvinfer1::PluginTensorDesc* outputDesc,
+        const void* const* inputs,
+        void* const* outputs,
+        void* workspace,
+        cudaStream_t stream) noexcept override;
+
+    /*****************************************************************************
+                  The IPluginV2Ext API defined by TensorRT
+    ******************************************************************************/
+    /*
+     * @brief Return the data type of the result (usually the same as the input type).
+     **/
+    nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept override;
+
+    /*****************************************************************************
+                  The IPluginV2 API defined by TensorRT
+    ******************************************************************************/
+    /*
+     * @brief The plugin name.
+     **/
+    const char* getPluginType() const noexcept override;
+    /*
+     * @brief The plugin version.
+     **/
+    const char* getPluginVersion() const noexcept override;
+    /*
+     * @brief How many tensors the plugin returns.
+     **/
+    int getNbOutputs() const noexcept override;
+
+    /*
+     * @brief Initialization, executed before this plugin starts to run.
+     **/
+    int initialize() noexcept override;
+
+    /*
+     * @brief Release resources; called when the engine is destroyed.
+     **/
+    void terminate() noexcept override {}
+
+    /*
+     * @brief How many bytes serialization needs to write into the buffer.
+     **/
+    size_t getSerializationSize() const noexcept override;
+
+    /*
+     * @brief Serialize the needed data into the buffer, in order.
+     **/
+    void serialize(void* buffer) const noexcept override;
+
+    /*
+     * @brief Destroy the plugin object; called when network/builder/engine are destroyed.
+     **/
+    void destroy() noexcept override {}
+
+    /*
+     * @brief Set the plugin namespace (defaults to "").
+     **/
+    void setPluginNamespace(const char* pluginNamespace) noexcept override {};
+
+    /*
+     * @brief Return the plugin namespace.
+     **/
+    const char* getPluginNamespace() const noexcept override;
+
+    /*****************************************************************************
+                  APIs specific to this plugin
+    ******************************************************************************/
+    /*
+     * @brief Return the input shape info.
+     **/
+    std::vector<int64_t> getInputShape();
+
+    /*
+     * @brief Return the output shape info.
+     **/
+    std::vector<int64_t> getOutputShape();
+
+    /*
+     * @brief Return the output spatial size.
+     **/
+    std::vector<int64_t> getOutputSize();
+
+    /*
+     * @brief Serialize the plugin to a string.
+     **/
+    std::string serializeToString() const;
+
+private:
+    nvinfer1::DataType dtype_;
+
+    std::vector<int64_t> in_shape_;
+    std::vector<int64_t> out_shape_;
+    std::vector<int64_t> size_;
+    std::vector<double> scales_;
+    std::string mode_;
+    bool align_corners_;
+    bool use_scales_;
+};
+
+/*
+* @brief InterpolatePluginCreator implements the plugin registration.
+*   (Together with the global macro REGISTER_TENSORRT_PLUGIN it registers the plugin
+*   with TensorRT; afterwards the plugin can be obtained via getPluginRegistry.)
+**/
+class InterpolatePluginCreator : public nvinfer1::IPluginCreator {
+public:
+    /*
+     * @brief Default constructor.
+     **/
+    InterpolatePluginCreator();
+
+    /*
+     * @brief Get the plugin name.
+     **/
+    const char* getPluginName() const noexcept override;
+
+    /*
+     * @brief Get the plugin version.
+     **/
+    const char* getPluginVersion() const noexcept override;
+
+    /*
+     * @brief Get the list of fields needed to create this plugin; the list is used by createPlugin.
+     **/
+    const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
+
+    /*
+     * @brief Create the plugin object from the plugin name and field list.
+     **/
+    nvinfer1::IPluginV2* createPlugin(
+        const char* name,
+        const nvinfer1::PluginFieldCollection* fc) noexcept override;
+
+    /*
+     * @brief Deserialize the plugin.
+     **/
+    nvinfer1::IPluginV2* deserializePlugin(
+        const char* name,
+        const void* serialData,
+        size_t serialLength) noexcept override;
+
+    /*
+     * @brief Set the plugin namespace.
+     **/
+    void setPluginNamespace(const char* libNamespace) noexcept override{};
+
+    /*
+     * @brief Get the plugin namespace.
+     **/
+    const char* getPluginNamespace() const noexcept override;
+
+private:
+    std::string name_;
+    std::vector<nvinfer1::PluginField> mPluginAttributes;
+    nvinfer1::PluginFieldCollection mFC;
+};
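+
+/*
+* A rough usage sketch (illustrative only; `network` and `input_tensor` stand
+* for the enclosing nvinfer1::INetworkDefinition and the plugin's input): a
+* converter can fetch the creator from the global registry by the name/version
+* registered above and build the plugin from a PluginFieldCollection, e.g.
+*
+*   auto creator = getPluginRegistry()->getPluginCreator("Interpolate", "1");
+*   nvinfer1::PluginFieldCollection fc;  // filled with the fields listed in
+*                                        // InterpolatePluginCreator's constructor
+*   nvinfer1::IPluginV2* plugin = creator->createPlugin("Interpolate", &fc);
+*   auto* layer = network->addPluginV2(&input_tensor, 1, *plugin);
+**/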
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/pooling.cpp b/poros/src/poros/converter/gpu/pooling.cpp
new file mode 100644
index 0000000000..ece7ad6539
--- /dev/null
+++ b/poros/src/poros/converter/gpu/pooling.cpp
@@ -0,0 +1,266 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file pooling.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Aug 18 11:25:13 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/pooling.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+//note1: the max_pool?d variants all take 6 input arguments with the same meanings; only the rank of the int[] differs.
+//note2: avg_pool1d takes 6 input arguments, while avg_pool2d and 3d take 7, adding the trailing divisor_override.
+//note3: max and avg diverge in definition from the 5th argument onwards, which needs care (see the schema reference below).
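+
+// For reference, the PyTorch schemas whose arguments are indexed below
+// (2-D variants shown; inputs[1] onwards):
+//   aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0,
+//                    int[2] dilation=1, bool ceil_mode=False) -> Tensor
+//   aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0,
+//                    bool ceil_mode=False, bool count_include_pad=True,
+//                    int? divisor_override=None) -> Tensor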
+
+        new_layer = engine->network()->addPoolingNd(*in, nvinfer1::PoolingType::kMAX, kernel_size);
+        POROS_CHECK(new_layer, "Unable to create Max Pooling layer from node: " << *node);
+        new_layer->setName((layer_info(node) + "_IPoolingLayer_max").c_str());
+
+    //when it's avg pooling
+    } else if (node->kind() == torch::jit::aten::avg_pool1d ||
+        node->kind() == torch::jit::aten::avg_pool2d ||
+        node->kind() == torch::jit::aten::avg_pool3d) {
+
+        ceil_mode = (engine->context().get_constant(inputs[4])).toBool();
+        bool count_include_pad = (engine->context().get_constant(inputs[5])).toBool();
+
+        new_layer = engine->network()->addPoolingNd(*in, nvinfer1::PoolingType::kAVERAGE, kernel_size);
+        POROS_CHECK(new_layer, "Unable to create Avg Pooling layer from node: " << *node);
+        new_layer->setAverageCountExcludesPadding(!count_include_pad);
+        new_layer->setName((layer_info(node) + "_IPoolingLayer_average").c_str());
+
+    //we should never reach here
+    } else {
+        POROS_THROW_ERROR("Unsupported pool mode!");
+    }
+
+    auto padding_mode =
+        ceil_mode ? nvinfer1::PaddingMode::kEXPLICIT_ROUND_UP : nvinfer1::PaddingMode::kEXPLICIT_ROUND_DOWN;
+
+    new_layer->setPaddingMode(padding_mode);
+    new_layer->setPaddingNd(padding);
+    new_layer->setStrideNd(stride);
+
+    auto out_tensor = add_unpadding(engine, node, new_layer->getOutput(0), orig_dims.nbDims, false, true);
+
+    // avg_pool2d or avg_pool3d divisor_override
+    if (node->kind() == torch::jit::aten::avg_pool2d ||
+        node->kind() == torch::jit::aten::avg_pool3d) {
+        auto maybe_divisor = engine->context().get_constant(inputs[6]);
+        if (maybe_divisor.isScalar()) {
+            auto divisor = maybe_divisor.toScalar().to<int64_t>();
+            if (divisor != 0) {
+                auto kernel_size_list = sizes_to_nvdim((engine->context().get_constant(inputs[1])).toIntList());
+                int64_t kernel_area = 1;
+                for (auto i = 0; i < kernel_size_list.nbDims; i++) {
+                    kernel_area *= kernel_size_list.d[i];
+                }
+                auto actual_divisor = tensor_to_const(engine, torch::tensor({(float)kernel_area / (float)divisor}));
+                auto mul = add_elementwise(engine, nvinfer1::ElementWiseOperation::kPROD,
+                                out_tensor, actual_divisor, layer_info(node) + "_prod");
+                POROS_CHECK(mul, "Unable to create mul layer from node: " << *node);
+                out_tensor = mul->getOutput(0);
+            } else {
+                LOG(INFO) << "Invalid parameter: divisor_override";
+                return false;
+            }
+        }
+    }
+
+    engine->context().set_tensor(node->outputs()[0], out_tensor);
+    LOG(INFO) << "Output tensor shape: " << out_tensor->getDimensions();
+    return true;
+}
+
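// [Editor's note: a worked example, not part of the original patch.]
// TensorRT's average pooling always divides each window sum by the kernel
// area, while PyTorch divides by divisor_override when it is set, so the
// converter above multiplies the TensorRT output by kernel_area / divisor.
// For a 3x3 kernel (kernel_area = 9) and divisor_override = 4:
//   trt_out = window_sum / 9
//   out     = trt_out * (9.0f / 4.0f) = window_sum / 4   // matches PyTorch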
+//note1: adaptive_avg_pool?d share the same argument meanings; only the rank of the int[] differs.
+//note2: adaptive_max_pool?d, same as note1: identical argument meanings, only the rank of the int[] differs.
+//note3: avg and max both take 2 inputs; avg produces 1 output while max produces 2.
+//note4: not all 6 ops of this family are implemented; this needs attention!!!
+bool AdaptivePoolingConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for AdaptivePoolingConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for AdaptivePoolingConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    nvinfer1::Dims orig_dims = in->getDimensions();
+    auto out_size = sizes_to_nvdim((engine->context().get_constant(inputs[1])).toIntList());
+    LOG(INFO) << "get out_size: " << out_size << " in AdaptivePoolingConverter";
+
+    nvinfer1::PoolingType pool_type;
+    if (node->kind() == torch::jit::aten::adaptive_avg_pool1d ||
+        node->kind() == torch::jit::aten::adaptive_avg_pool2d) {
+        pool_type = nvinfer1::PoolingType::kAVERAGE;
+    } else if (node->kind() == torch::jit::aten::adaptive_max_pool2d) {
+        pool_type = nvinfer1::PoolingType::kMAX;
+    } else {
+        POROS_THROW_ERROR("Unsupported Adaptive pool mode!");
+    }
+
+    // Corner case: when out dimension is all ones, replace with simpler operation
+    if (out_size.d[0] == 1 && (out_size.nbDims < 2 || out_size.d[1] == 1) &&
+        (out_size.nbDims < 3 || out_size.d[2] == 1)) {
+        LOG(INFO) << "Matched corner case in AdaptivePoolingConverter";
+        // Generate a bitmask of all 1s except the lowest 2 bits (the N and C axes)
+        uint32_t reduceAxes = ((1 << orig_dims.nbDims) - 1) & ~0b11;
+        auto* new_layer = engine->network()->addReduce(
+            *in,
+            pool_type == nvinfer1::PoolingType::kMAX ? nvinfer1::ReduceOperation::kMAX : nvinfer1::ReduceOperation::kAVG,
+            reduceAxes,
+            /*keepDimensions=*/true);
+        new_layer->setName((layer_info(node) + "_IReduceLayer").c_str());
+
+        engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+        LOG(INFO) << "AdaptivePoolingConverter: Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+        return true;
+    }
+
+    bool expandDims = (orig_dims.nbDims < 4);
+    POROS_CHECK(orig_dims.nbDims > 2, "Unable to create pooling layer from node: " << *node);
+    if (expandDims) {
+        in = add_padding(engine, node, in, 4, false, false);
+    }
+
+    if (out_size.nbDims == 1) {
+        out_size = unsqueeze_dims(out_size, 0, 1);
+    }
+
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+    nvinfer1::ILayer* new_layer = nullptr;
+
+    nvinfer1::PluginFieldCollection fc;
+    std::vector<nvinfer1::PluginField> f;
+    auto out_shape = in_shape;
+    auto out_size_vec = nvdim_to_sizes(out_size);
+
+    std::copy(out_size_vec.begin(), out_size_vec.end(), out_shape.begin() + (in_shape.size() - out_size_vec.size()));
+    std::vector<int32_t> in_shape_casted(in_shape.begin(), in_shape.end());
+    f.emplace_back(
+        nvinfer1::PluginField("in_shape", in_shape_casted.data(), nvinfer1::PluginFieldType::kINT32, in_shape.size()));
+    std::vector<int32_t> out_shape_casted(out_shape.begin(), out_shape.end());
+    f.emplace_back(
+        nvinfer1::PluginField("out_shape", out_shape_casted.data(), nvinfer1::PluginFieldType::kINT32, out_shape.size()));
+    std::vector<int32_t> out_size_casted(out_size_vec.begin(), out_size_vec.end());
+    f.emplace_back(
+        nvinfer1::PluginField("out_size", out_size_casted.data(), nvinfer1::PluginFieldType::kINT32, out_size_vec.size()));
+    f.emplace_back(
+        nvinfer1::PluginField("scales", nullptr, nvinfer1::PluginFieldType::kFLOAT64, 0));
+
+    int32_t align_corners_casted = 0;
+    f.emplace_back(
+        nvinfer1::PluginField("align_corners", &align_corners_casted, nvinfer1::PluginFieldType::kINT32, 1));
+    int32_t use_scales_casted = 0;
+    f.emplace_back(
+        nvinfer1::PluginField("use_scales", &use_scales_casted, nvinfer1::PluginFieldType::kINT32, 1));
+
+    std::string mode = "adaptive_avg_pool2d";
+    if (pool_type == nvinfer1::PoolingType::kMAX) {
+        mode = "adaptive_max_pool2d";
+    }
+    f.emplace_back(
+        nvinfer1::PluginField("mode", &mode, nvinfer1::PluginFieldType::kCHAR, 1));
+
+    fc.nbFields = f.size();
+    fc.fields = f.data();
+
+    auto creator = getPluginRegistry()->getPluginCreator("Interpolate", "1", "");
+    auto interpolate_plugin = creator->createPlugin(mode.c_str(), &fc);
+    LOG(INFO) << "create Interpolate plugin done";
+
+    new_layer = engine->network()->addPluginV2(reinterpret_cast<nvinfer1::ITensor* const*>(&in), 1, *interpolate_plugin);
+    POROS_CHECK(new_layer, "Unable to create pooling (interpolation) plugin from node" << *node);
+    new_layer->setName((layer_info(node) + "_plugin_Interpolate").c_str());
+    auto layer_output = add_unpadding(engine, node, new_layer->getOutput(0), orig_dims.nbDims, false, false);
+
+    engine->context().set_tensor(node->outputs()[0], layer_output);
+    LOG(INFO) << "Output tensor shape: " << layer_output->getDimensions();
+    //attention: for adaptive_max_pool2d, also map the second output
+    if (mode == "adaptive_max_pool2d") {
+        engine->context().set_tensor(node->outputs()[1], new_layer->getOutput(1));
+        LOG(INFO) << "Output tensor2 shape: " << new_layer->getOutput(1)->getDimensions();
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, PoolingConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, AdaptivePoolingConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
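[Editor's note: a worked example, not part of the patch. The all-ones corner case in AdaptivePoolingConverter above reduces over every axis except batch and channel; the bitmask arithmetic is small enough to check by hand.]

#include <cassert>
#include <cstdint>

int main() {
    // For a 4-D NCHW input: (1 << 4) - 1 = 0b1111 selects all four axes,
    // and & ~0b11 clears bits 0 and 1 (N and C), leaving H and W.
    int nbDims = 4;
    uint32_t reduceAxes = ((1u << nbDims) - 1) & ~0b11u;
    assert(reduceAxes == 0b1100u);  // reduce over H and W only
    return 0;
}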
diff --git a/poros/src/poros/converter/gpu/pooling.h b/poros/src/poros/converter/gpu/pooling.h
new file mode 100644
index 0000000000..45a628f278
--- /dev/null
+++ b/poros/src/poros/converter/gpu/pooling.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file pooling.h
+* @author tianjinjin@baidu.com
+* @date Tue Aug 17 22:57:03 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <vector>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class PoolingConverter : public GpuConverter {
+public:
+    PoolingConverter() {}
+    virtual ~PoolingConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor",
+                "aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor",
+                "aten::max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor",
+                "aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor",
+                "aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor",
+                "aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor"
+        };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::max_pool1d,
+                torch::jit::aten::avg_pool1d,
+                torch::jit::aten::max_pool2d,
+                torch::jit::aten::avg_pool2d,
+                torch::jit::aten::max_pool3d,
+                torch::jit::aten::avg_pool3d};
+    }
+};
+
+
+class AdaptivePoolingConverter : public GpuConverter {
+public:
+    AdaptivePoolingConverter() {}
+    virtual ~AdaptivePoolingConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor",
+                "aten::adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor",
+                "aten::adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)"
+        };
+    }
+
+    /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELOW:
+     * aten::adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor
+     * aten::adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
+     * aten::adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
+     **/
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::adaptive_avg_pool1d,
+                torch::jit::aten::adaptive_avg_pool2d,
+                torch::jit::aten::adaptive_max_pool2d
+        };
+    }
+};
+
+
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
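[Editor's note: a hedged reference sketch, not part of the patch. The divisor_override argument that distinguishes avg_pool2d/3d from avg_pool1d in the schemas above can be exercised directly through ATen; the shapes and values are illustrative.]

#include <torch/script.h>
#include <iostream>

int main() {
    auto x = torch::randn({1, 1, 4, 4});
    // divisor_override=4: each window sum is divided by 4 instead of the
    // kernel area 9, which is exactly the difference the kernel_area/divisor
    // rescale in pooling.cpp compensates for.
    auto y = at::avg_pool2d(x, /*kernel_size=*/{3, 3}, /*stride=*/{1, 1},
                            /*padding=*/{0, 0}, /*ceil_mode=*/false,
                            /*count_include_pad=*/true, /*divisor_override=*/4);
    std::cout << y.sizes() << std::endl;  // [1, 1, 2, 2]
    return 0;
}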
diff --git a/poros/src/poros/converter/gpu/reduce.cpp b/poros/src/poros/converter/gpu/reduce.cpp
new file mode 100644
index 0000000000..bf497e77cd
--- /dev/null
+++ b/poros/src/poros/converter/gpu/reduce.cpp
@@ -0,0 +1,281 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file reduce.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Aug 27 10:18:24 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/reduce.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor",
+"aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"*/
+bool MeanConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for MeanConverter is not Tensor as expected");
+
+    //extract self
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_dims = nvdim_to_sizes(in_tensor->getDimensions());
+    LOG(WARNING) << "MeanConverter disregards dtype";
+
+    uint32_t axis_mask = (uint32_t)(((uint64_t)1 << in_dims.size()) - 1);
+    auto keepdim = false;
+
+    // aten::mean.dim situation
+    auto maybe_dim = engine->context().get_constant(inputs[1]);
+    if ((inputs.size() == 4) && maybe_dim.isIntList() &&
+        (engine->context().get_constant(inputs[2])).isBool()) {
+        auto dims = maybe_dim.toIntList();
+        c10::List<int64_t> calculated_dims;
+        for (size_t i = 0; i < dims.size(); i++) {
+            auto dim_val = dims[i] < 0 ? (in_dims.size() + dims[i]) : dims[i];
+            calculated_dims.push_back(dim_val);
+        }
+        axis_mask = 0;
+        for (size_t d = 0; d < calculated_dims.size(); d++) {
+            axis_mask |= 1 << calculated_dims[d];
+        }
+        keepdim = (engine->context().get_constant(inputs[2])).toBool();
+    }
+
+    auto mean_layer = engine->network()->addReduce(*in_tensor,
+        nvinfer1::ReduceOperation::kAVG, axis_mask, keepdim);
+    POROS_CHECK(mean_layer, "Unable to create mean layer from node: " << *node);
+    mean_layer->setName((layer_info(node) + "_IReduceLayer_avg").c_str());
+
+    engine->context().set_tensor(node->outputs()[0], mean_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << mean_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
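// [Editor's note: a worked example, not part of the original patch.]
// The dim-list to axis-bitmask conversion above maps, e.g., dims = [-1, 1]
// on a rank-3 input to:
//   -1 -> 3 + (-1) = 2, so calculated_dims = [2, 1]
//   axis_mask = (1 << 2) | (1 << 1) = 0b110
// i.e. reduce over axes 1 and 2 while keeping axis 0. The same pattern is
// reused by the Sum and Prod converters below.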
dtype=None) -> Tensor"*/ +bool SumConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for SumConverter is not Tensor as expected"); + + //extract self + auto in_tensor = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node); + auto in_dims = nvdim_to_sizes(in_tensor->getDimensions()); + LOG(WARNING) << "SumConverter disregards dtype"; + + uint32_t axis_mask = (uint32_t)(((uint64_t)1 << in_dims.size()) - 1); + auto keepdim = false; + + // aten::sum.dim_IntList situation + auto maybe_dim = engine->context().get_constant(inputs[1]); + if ((inputs.size() == 4) && maybe_dim.isIntList() && + (engine->context().get_constant(inputs[2])).isBool()) { + auto dims = maybe_dim.toIntList(); + c10::List calculated_dims; + for (size_t i = 0; i < dims.size(); i++) { + auto dim_val = dims[i] < 0 ? (in_dims.size() + dims[i]) : dims[i]; + calculated_dims.push_back(dim_val); + } + axis_mask = 0; + for (size_t d = 0; d < calculated_dims.size(); d++) { + axis_mask |= 1 << calculated_dims[d]; + } + keepdim = (engine->context().get_constant(inputs[2])).toBool(); + } + + auto mean_layer = engine->network()->addReduce(*in_tensor, + nvinfer1::ReduceOperation::kSUM, axis_mask, keepdim); + POROS_CHECK(mean_layer, "Unable to create mean layer from node: " << *node); + mean_layer->setName((layer_info(node) + "_IReduceLayer_sum").c_str()); + + engine->context().set_tensor(node->outputs()[0], mean_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << mean_layer->getOutput(0)->getDimensions(); + return true; +} + +/* +"aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor", +"aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"*/ +bool ProdConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) { + at::ArrayRef inputs = node->inputs(); + POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())), + "input[0] for ProdConverter is not Tensor as expected"); + + //extract self + auto in_tensor = engine->context().get_tensor(inputs[0]); + POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node); + auto in_dims = nvdim_to_sizes(in_tensor->getDimensions()); + LOG(WARNING) << "ProdConverter disregards dtype"; + + uint32_t axis_mask = (uint32_t)(((uint64_t)1 << in_dims.size()) - 1); + auto keepdim = false; + + //aten::prod.dim_int situation + auto maybe_dim = engine->context().get_constant(inputs[1]); + if ((inputs.size() == 4) && maybe_dim.isInt() && + (engine->context().get_constant(inputs[2])).isBool()) { + auto dim = maybe_dim.toInt(); + dim = dim < 0 ? 
+        axis_mask = 1 << dim;
+
+        keepdim = (engine->context().get_constant(inputs[2])).toBool();
+    }
+
+    auto mean_layer = engine->network()->addReduce(*in_tensor,
+        nvinfer1::ReduceOperation::kPROD, axis_mask, keepdim);
+    POROS_CHECK(mean_layer, "Unable to create prod layer from node: " << *node);
+    mean_layer->setName((layer_info(node) + "_IReduceLayer_prod").c_str());
+
+    engine->context().set_tensor(node->outputs()[0], mean_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << mean_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+/*
+"aten::max(Tensor self) -> Tensor",
+"aten::max.other(Tensor self, Tensor other) -> Tensor"
+"aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"*/
+bool MaxMinConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for MaxMinConverter is not Tensor as expected");
+
+    //extract self
+    auto in_tensor = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in_tensor != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_dims = nvdim_to_sizes(in_tensor->getDimensions());
+
+    bool is_dynamic = check_nvtensor_is_dynamic(in_tensor);
+
+    nvinfer1::ILayer* new_layer;
+    //aten::max situation
+    if (inputs.size() == 1) {
+        uint32_t axis_mask = (uint32_t)(((uint64_t)1 << in_dims.size()) - 1);
+        auto keepdim = false;
+
+        nvinfer1::ReduceOperation reduce_type = (node->kind() == torch::jit::aten::max)
+                                                ? nvinfer1::ReduceOperation::kMAX
+                                                : nvinfer1::ReduceOperation::kMIN;
+        new_layer = engine->network()->addReduce(*in_tensor, reduce_type, axis_mask, keepdim);
+        new_layer->setName((layer_info(node) + "_IReduceLayer_max_or_min").c_str());
+        POROS_CHECK(new_layer, "Unable to create reduce layer from node: " << *node);
+
+    //aten::max.other situation
+    } else if (inputs.size() == 2) {
+        //extract other
+        auto other = engine->context().get_tensor(inputs[1]);
+        POROS_CHECK_TRUE((other != nullptr), "Unable to init input tensor for node: " << *node);
+
+        nvinfer1::ElementWiseOperation element_type = (node->kind() == torch::jit::aten::max)
+                                                      ? nvinfer1::ElementWiseOperation::kMAX
+                                                      : nvinfer1::ElementWiseOperation::kMIN;
+        new_layer = add_elementwise(engine,
+                        element_type,
+                        in_tensor,
+                        other,
+                        layer_info(node) + "_max_or_min");
+        POROS_CHECK(new_layer, "Unable to create element_wise layer from node: " << *node);
+
+    } else if (inputs.size() == 3 && node->outputs().size() == 2 &&
+               inputs[1]->type()->kind() == c10::TypeKind::IntType) {
+        POROS_CHECK_TRUE((in_dims.size() > 1),
+            "Converter aten::max.dim error: At least 2 dimensions are required for input[0].");
+        nvinfer1::ITensor* output_max = nullptr;
+        nvinfer1::ITensor* output_indices = nullptr;
+        int64_t dim = engine->context().get_constant(inputs[1]).toInt();
+        dim = dim < 0 ? in_dims.size() + dim : dim;
+
+        bool keep_dim = engine->context().get_constant(inputs[2]).toBool();
+        uint32_t shiftDim = 1 << dim;
+        nvinfer1::TopKOperation topk_option = (node->kind() == torch::jit::aten::max)
+                                              ? nvinfer1::TopKOperation::kMAX
+                                              : nvinfer1::TopKOperation::kMIN;
+        nvinfer1::ITopKLayer* topk_layer = engine->network()->addTopK(*in_tensor, topk_option, 1, shiftDim);
+        POROS_CHECK(topk_layer, "Unable to create TopK layer from node: " << *node);
+        topk_layer->setName((layer_info(node) + "_ITopKLayer").c_str());
+        output_max = topk_layer->getOutput(0);
+        output_indices = topk_layer->getOutput(1);
+
+        // squeeze output dim
+        if (in_tensor->getDimensions().nbDims > 1 && !keep_dim) {
+            auto shuffle_layer1 = engine->network()->addShuffle(*output_max);
+            auto shuffle_layer2 = engine->network()->addShuffle(*output_indices);
+            if (is_dynamic) {
+                nvinfer1::ITensor* self_shape_tensor = engine->network()->addShape(*in_tensor)->getOutput(0);
+                nvinfer1::ITensor* squeeze_output_shape = squeeze_nv_shapetensor(engine, self_shape_tensor, dim);
+                shuffle_layer1->setInput(1, *squeeze_output_shape);
+                shuffle_layer2->setInput(1, *squeeze_output_shape);
+            } else {
+                in_dims.erase(in_dims.begin() + dim);
+                nvinfer1::Dims squeeze_output_dims = sizes_to_nvdim(in_dims);
+                shuffle_layer1->setReshapeDimensions(squeeze_output_dims);
+                shuffle_layer2->setReshapeDimensions(squeeze_output_dims);
+            }
+            output_max = shuffle_layer1->getOutput(0);
+            output_indices = shuffle_layer2->getOutput(0);
+        }
+
+        engine->context().set_tensor(node->outputs()[0], output_max);
+        engine->context().set_tensor(node->outputs()[1], output_indices);
+        LOG(INFO) << "Output tensor1 shape: " << output_max->getDimensions();
+        LOG(INFO) << "Output tensor2 shape: " << output_indices->getDimensions();
+        return true;
+
+    } else {
+        //some other situation not supported yet
+        POROS_THROW_ERROR("We should never reach here for MaxMinConverter, meet Unsupported inputs size!");
+    }
+
+    nvinfer1::ITensor* output = new_layer->getOutput(0);
+
+    if (output->getDimensions().nbDims == 0) {
+        auto shuffle_layer = engine->network()->addShuffle(*output);
+        nvinfer1::Dims output_dims;
+        output_dims.nbDims = 1;
+        output_dims.d[0] = 1;
+        shuffle_layer->setReshapeDimensions(output_dims);
+        shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_output").c_str());
+        output = shuffle_layer->getOutput(0);
+    }
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, MeanConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, SumConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ProdConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, MaxMinConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
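[Editor's note: a hedged reference sketch, not part of the patch. The aten::max.dim branch above maps to TensorRT's TopK with k=1; the same equivalence can be checked against ATen directly. Built against libtorch; the tensor values are illustrative.]

#include <torch/script.h>
#include <iostream>

int main() {
    auto x = torch::tensor({{1.0, 9.0, 3.0},
                            {7.0, 2.0, 8.0}});
    // aten::max.dim returns (values, indices) with the reduced dim squeezed
    // when keepdim=false; topk(k=1) keeps that dim as size 1, which is why
    // the converter adds IShuffleLayers to squeeze both TopK outputs.
    auto max_pair = torch::max(x, /*dim=*/1, /*keepdim=*/false);
    auto topk_pair = x.topk(/*k=*/1, /*dim=*/1);
    std::cout << std::get<0>(max_pair) << std::get<1>(max_pair);
    std::cout << std::get<0>(topk_pair).squeeze(1) << std::get<1>(topk_pair).squeeze(1);
    return 0;
}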
diff --git a/poros/src/poros/converter/gpu/reduce.h b/poros/src/poros/converter/gpu/reduce.h
new file mode 100644
index 0000000000..9da7b1a62d
--- /dev/null
+++ b/poros/src/poros/converter/gpu/reduce.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file reduce.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 27 10:18:24 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <vector>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class MeanConverter : public GpuConverter {
+public:
+    MeanConverter() {}
+    virtual ~MeanConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor",
+                "aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+     * "aten::mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::mean};
+    }
+};
+
+class SumConverter : public GpuConverter {
+public:
+    SumConverter() {}
+    virtual ~SumConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor",
+                "aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+     * "aten::sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::sum};
+    }
+};
+
+class ProdConverter : public GpuConverter {
+public:
+    ProdConverter() {}
+    virtual ~ProdConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor",
+                "aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor",
+     * "aten::prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::prod};
+    }
+};
+
+class MaxMinConverter : public GpuConverter {
+public:
+    MaxMinConverter() {}
+    virtual ~MaxMinConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::max(Tensor self) -> Tensor",
+                "aten::min(Tensor self) -> Tensor",
+                "aten::max.other(Tensor self, Tensor other) -> Tensor",
+                "aten::min.other(Tensor self, Tensor other) -> Tensor",
+                "aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)",
+                "aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)",
+     * "aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)",
+     * "aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)",
+     * "aten::max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
+     *
+     * "aten::min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)",
+     * "aten::min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)",
+     * "aten::min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)",
+     * "aten::min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::max,
+                torch::jit::aten::min};
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/reflection_pad.cpp b/poros/src/poros/converter/gpu/reflection_pad.cpp
new file mode 100644
index 0000000000..a2965dfcac
--- /dev/null
+++ b/poros/src/poros/converter/gpu/reflection_pad.cpp
@@ -0,0 +1,436 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file reflection_pad.cpp
+* @author tianshaoqing@baidu.com
+* @date Tue Aug 16 16:54:20 CST 2022
+* @brief
+**/
+
+#include "poros/converter/gpu/reflection_pad.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * @brief Flips the given dim of the input; supports dynamic shapes.
+ *
+ * @param [in] engine : trt engine
+ * @param [in] node : the current node
+ * @param [in] input : the tensor to flip
+ * @param [in] is_dynamic : whether the input is dynamic
+ * @param [in] dim : the dimension to flip
+ *
+ * @return nvinfer1::ITensor*
+ * @retval the flipped tensor
+**/
+static nvinfer1::ITensor* flip_nvtensor(TensorrtEngine* engine,
+                const torch::jit::Node *node,
+                nvinfer1::ITensor* input,
+                bool is_dynamic,
+                int dim) {
+
+    auto in_dims = input->getDimensions();
+    int64_t in_rank = in_dims.nbDims;
+    dim = dim < 0 ? in_rank + dim : dim;
+
+    POROS_ASSERT(dim >= 0 && dim < in_rank, "flip dim is out of range. expect range is [" +
+        std::to_string(-in_rank) + ", " + std::to_string(in_rank - 1) + "].");
+
+    if (!is_dynamic) {
+        std::vector<int64_t> start_vec, size_vec, stride_vec;
+        for (int32_t r = 0; r < in_rank; r++) {
+            start_vec.push_back(0);
+            size_vec.push_back(in_dims.d[r]);
+            stride_vec.push_back(1);
+        }
+        start_vec[dim] = size_vec[dim] - 1;
+        stride_vec[dim] = -1;
+
+        auto slice_layer = engine->network()->addSlice(*input,
+                                sizes_to_nvdim(start_vec),
+                                sizes_to_nvdim(size_vec),
+                                sizes_to_nvdim(stride_vec));
+        slice_layer->setName((layer_info(node) + "_ISliceLayer_flip_for_dim_" + std::to_string(dim)).c_str());
+        return slice_layer->getOutput(0);
+    } else {
+        nvinfer1::ITensor* input_shape = engine->network()->addShape(*input)->getOutput(0);
+        std::vector<int64_t> stride_vec(in_rank, 1), dim_mask_vec(in_rank, 0), tmp_vec(in_rank, 0);
+        stride_vec[dim] = -1;
+        dim_mask_vec[dim] = 1;
+
+        nvinfer1::ITensor* dim_mask_tensor = tensor_to_const(engine, torch::tensor(dim_mask_vec, torch::kInt32));
+        nvinfer1::ITensor* stride_tensor = tensor_to_const(engine, torch::tensor(stride_vec, torch::kInt32));
+
+        nvinfer1::ITensor* start_tensor = add_elementwise(engine,
+                    nvinfer1::ElementWiseOperation::kPROD,
+                    input_shape,
+                    dim_mask_tensor,
+                    layer_info(node) + "_prod_flip_for_dim_" +
+                    std::to_string(dim))->getOutput(0);
+
+        start_tensor = add_elementwise(engine,
+                    nvinfer1::ElementWiseOperation::kSUB,
+                    start_tensor,
+                    dim_mask_tensor,
+                    layer_info(node) + "_sub_flip_for_dim_" +
+                    std::to_string(dim))->getOutput(0);
+
+        auto slice_layer = engine->network()->addSlice(*input,
+                                sizes_to_nvdim(tmp_vec),
+                                sizes_to_nvdim(tmp_vec),
+                                sizes_to_nvdim(tmp_vec));
+        slice_layer->setInput(0, *input);
+        slice_layer->setInput(1, *start_tensor);
+        slice_layer->setInput(2, *input_shape);
+        slice_layer->setInput(3, *stride_tensor);
+        slice_layer->setName((layer_info(node) + "_ISliceLayer_flip_for_dim_" + std::to_string(dim)).c_str());
+
+        return slice_layer->getOutput(0);
+    }
+}
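// [Editor's note: a worked example, not part of the original patch.]
// flip_nvtensor on a static [2, 4] input with dim = 1 builds the slice
//   start  = [0, 3]   (size - 1 on the flipped axis)
//   size   = [2, 4]
//   stride = [1, -1]  (a negative stride walks the axis backwards)
// so [[0,1,2,3],[4,5,6,7]] becomes [[3,2,1,0],[7,6,5,4]]. The dynamic branch
// computes the same start tensor at runtime as input_shape * dim_mask - dim_mask.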
+
+/**
+ * @brief Computes, from the padding sizes, the start and size of the left slice
+ *        and the size of the right slice; see the comment on
+ *        ReflectionPadConverter::converter below for how they are used.
+ *
+ * @param [in] engine : trt engine
+ * @param [in] node : the current node
+ * @param [in] input : the tensor to reflection-pad
+ * @param [in] padding_is_nvtensor : whether the padding was passed in as nv scalars
+ * @param [in] padding_tensor : holds the padding data when padding_is_nvtensor is true
+ * @param [in] padding_size : holds the padding data when padding_is_nvtensor is false
+ * @param [in] axis : the dimension currently being padded
+ *
+ * @return std::tuple<nvinfer1::ITensor*, nvinfer1::ITensor*, nvinfer1::ITensor*>
+ * @retval the start and size of the left slice, and the size of the right slice
+**/
+static std::tuple<nvinfer1::ITensor*, nvinfer1::ITensor*, nvinfer1::ITensor*> gen_slice_start_size(
+                TensorrtEngine* engine,
+                const torch::jit::Node *node,
+                nvinfer1::ITensor* input,
+                bool padding_is_nvtensor,
+                std::vector<nvinfer1::ITensor*> padding_tensor,
+                std::vector<int32_t> padding_size,
+                int32_t axis) {
+    nvinfer1::ITensor* left_start_tensor = nullptr;
+    nvinfer1::ITensor* left_size_tensor = nullptr;
+    nvinfer1::ITensor* right_size_tensor = nullptr;
+    // start_vec[axis] = inDims.d[axis] - padding[padding_index] - 1; the base is 0
+    // size_vec[axis] = padding[padding_index]; the base is the full size
+    nvinfer1::ITensor* input_shape = engine->network()->addShape(*input)->getOutput(0);
+
+    auto in_dims = input->getDimensions();
+    int64_t in_rank = in_dims.nbDims;
+    std::vector<int64_t> dim_mask_vec(in_rank, 0), dim_remask_vec(in_rank, 1);
+    dim_mask_vec[axis] = 1;
+    dim_remask_vec[axis] = 0;
+    nvinfer1::ITensor* dim_mask_tensor = tensor_to_const(engine, torch::tensor(dim_mask_vec, torch::kInt32));
+    nvinfer1::ITensor* dim_remask_tensor = tensor_to_const(engine, torch::tensor(dim_remask_vec, torch::kInt32));
+
+    nvinfer1::ITensor* shape_mask_axis = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kPROD,
+                input_shape,
+                dim_mask_tensor,
+                layer_info(node) + std::string("_left_shape_mask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    nvinfer1::ITensor* left_padding_size_tensor = nullptr;
+    nvinfer1::ITensor* right_padding_size_tensor = nullptr;
+    if (!padding_is_nvtensor) {
+        left_padding_size_tensor = tensor_to_const(engine, torch::tensor({padding_size[0]}, torch::kInt32));
+        right_padding_size_tensor = tensor_to_const(engine, torch::tensor({padding_size[1]}, torch::kInt32));
+    } else {
+        left_padding_size_tensor = padding_tensor[0];
+        right_padding_size_tensor = padding_tensor[1];
+    }
+
+    nvinfer1::ITensor* left_padding_mask_axis = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kPROD,
+                dim_mask_tensor,
+                left_padding_size_tensor,
+                layer_info(node) + std::string("_left_padding_mask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    nvinfer1::ITensor* shape_sub_left_padding_mask_axis = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUB,
+                shape_mask_axis,
+                left_padding_mask_axis,
+                layer_info(node) + std::string("_left_shape_sub_padding_mask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    left_start_tensor = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUB,
+                shape_sub_left_padding_mask_axis,
+                dim_mask_tensor,
+                layer_info(node) + std::string("_left_shape_sub_padding_sub_one_mask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    nvinfer1::ITensor* shape_remask_axis = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kPROD,
+                input_shape,
+                dim_remask_tensor,
+                layer_info(node) + std::string("_left_shape_remask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    left_size_tensor = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUM,
+                shape_remask_axis,
+                left_padding_mask_axis,
+                layer_info(node) + std::string("_left_shape_remask_sum_padding_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    nvinfer1::ITensor* right_padding_mask_axis = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kPROD,
+                dim_mask_tensor,
+                right_padding_size_tensor,
+                layer_info(node) + std::string("_right_padding_mask_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    right_size_tensor = add_elementwise(engine,
+                nvinfer1::ElementWiseOperation::kSUM,
+                shape_remask_axis,
+                right_padding_mask_axis,
+                layer_info(node) + std::string("_right_shape_remask_sum_padding_axis_") +
+                std::to_string(axis))->getOutput(0);
+
+    return std::make_tuple(left_start_tensor, left_size_tensor, right_size_tensor);
+}
+
+/**
+ * @brief ReflectionPad: mirror padding. The padding rule is like constant_pad_nd,
+ *        except that the padded values are a mirror image of the edge values.
+ * e.g. input x = torch.arange(8).reshape(2, 4) =
+ * [[0,1,2,3],
+ *  [4,5,6,7]]
+ * then ReflectionPad1d(x, [1,2]) =
+ * [[1,0,1,2,3,2,1],
+ *  [5,4,5,6,7,6,5]]
+ *
+ * Converter idea:
+ * first flip x along the padding dim: x_flip =
+ * [[3,2,1,0],
+ *  [7,6,5,4]]
+ * left padding size = 1
+ * x' = cat([x_flip[:, -2], x], dim = 1)
+ * [[3,2,|1|,0],      [[|1|,0,1,2,3],
+ *  [7,6,|5|,4]]       [|5|,4,5,6,7]]
+ *        ^               ^
+ * right padding size = 2
+ * x'' = cat([x', x_flip[:, 1:3]], dim = 1)
+ * [[3,|2,1|,0],      [[1,0,1,2,3,|2,1|],
+ *  [7,|6,5|,4]]       [5,4,5,6,7,|6,5|]]
+ *      ^                          ^
+ * ReflectionPad2d works the same way.
+ *
+ * @param [in] engine : trt engine
+ * @param [in] node : the current node
+ *
+ * @return bool
+ * @retval true if the conversion succeeded, false otherwise
+**/
+bool ReflectionPadConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    // "aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor"
+    // "aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor"
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for ReflectionPadConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ReflectionPadConverter is not Tensor as expected");
+
+    //extract self
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto inDims = in->getDimensions();
+    int64_t inRank = inDims.nbDims;
+
+    std::vector<nvinfer1::ITensor*> tensors_vec;
+
+    bool has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+    bool input0_is_dynamic = check_nvtensor_is_dynamic(in);
+
+    if (!has_tensor_scalar) {
+
+        //extract padding
+        auto padding = (engine->context().get_constant(inputs[1])).toIntList().vec();
+
+        for (int64_t i = 0; i < int(padding.size() / 2); i++) {
+            int64_t axis = inRank - (i + 1); // axis = {inRank - 1, inRank - 2}
+            int64_t padding_index = i * 2;
+
+            nvinfer1::ITensor* in_flip = flip_nvtensor(engine, node, in, input0_is_dynamic, axis);
+
+            std::tuple<nvinfer1::ITensor*, nvinfer1::ITensor*, nvinfer1::ITensor*> left_start_size_right_size;
+
+            if (inDims.d[axis] < 0) {
+                std::vector<nvinfer1::ITensor*> tmp_itensor_vec;
+                std::vector<int32_t> padding_size_vec = {(int32_t)padding[padding_index], (int32_t)padding[padding_index + 1]};
+                left_start_size_right_size = gen_slice_start_size(engine, node, in, false, tmp_itensor_vec, padding_size_vec, axis);
+            }
+
+            if (padding[padding_index] > 0) { // left padding value
+                tensors_vec.clear();
+                std::vector<int64_t> start_vec, size_vec, stride_vec;
+                for (int32_t r = 0; r < inRank; r++) {
+                    start_vec.push_back(0);
+                    size_vec.push_back(inDims.d[r]);
+                    stride_vec.push_back(1);
+                }
+                start_vec[axis] = inDims.d[axis] - padding[padding_index] - 1;
+                size_vec[axis] = padding[padding_index];
+
+                auto slice_layer = engine->network()->addSlice(*in_flip,
+                                        sizes_to_nvdim(start_vec),
+                                        sizes_to_nvdim(size_vec),
+                                        sizes_to_nvdim(stride_vec));
+                slice_layer->setName((layer_info(node) + "_ISliceLayer_for_leftpadding_" + std::to_string(axis)).c_str());
+                if (inDims.d[axis] < 0) {
+                    slice_layer->setInput(1, *(std::get<0>(left_start_size_right_size)));
+                    slice_layer->setInput(2, *(std::get<1>(left_start_size_right_size)));
+                }
+
+                tensors_vec.push_back(slice_layer->getOutput(0));
+                tensors_vec.push_back(in);
+
+                auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+                concat_layer->setAxis(axis);
+                concat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_leftpadding_" + std::to_string(axis)).c_str());
+                in = concat_layer->getOutput(0);
+                inDims = in->getDimensions();
+            }
+
+            if (padding[padding_index + 1] > 0) { // right padding value
+                tensors_vec.clear();
+                tensors_vec.push_back(in);
+
+                std::vector<int64_t> start_vec, size_vec, stride_vec;
+                for (int32_t r = 0; r < inRank; r++) {
+                    start_vec.push_back(0);
+                    size_vec.push_back(inDims.d[r]);
+                    stride_vec.push_back(1);
+                }
+                start_vec[axis] = 1;
+                size_vec[axis] = padding[padding_index + 1];
+
+                auto slice_layer = engine->network()->addSlice(*in_flip,
+                                        sizes_to_nvdim(start_vec),
+                                        sizes_to_nvdim(size_vec),
+                                        sizes_to_nvdim(stride_vec));
+                slice_layer->setName((layer_info(node) + "_ISliceLayer_for_rightpadding_" + std::to_string(axis)).c_str());
+                if (inDims.d[axis] < 0) {
+                    slice_layer->setInput(2, *(std::get<2>(left_start_size_right_size)));
+                }
+
+                tensors_vec.push_back(slice_layer->getOutput(0));
+
+                auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+                concat_layer->setAxis(axis);
+                concat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_rightpadding_" + std::to_string(axis)).c_str());
+                in = concat_layer->getOutput(0);
+                inDims = in->getDimensions();
+
+            }
+        }
+    } else {
+        // split the padding nvtensor into per-element scalars first
+        std::vector<nvinfer1::ITensor*> padding_tensor_vec;
+        nvinfer1::ITensor* padding_tensor = this->get_tensor_scalar(inputs[1]);
+        nvinfer1::Dims padding_tensor_dims = padding_tensor->getDimensions();
+
+        for (int i = 0; i < padding_tensor_dims.d[0]; i++) {
+            std::vector<int64_t> start_vec(1, i), size_vec(1, 1), stride_vec(1, 1);
+            auto slice_layer = engine->network()->addSlice(*padding_tensor,
+                                    sizes_to_nvdim(start_vec),
+                                    sizes_to_nvdim(size_vec),
+                                    sizes_to_nvdim(stride_vec));
+            slice_layer->setName((layer_info(node) + "_ISliceLayer_for_padding_tensor_" + std::to_string(i)).c_str());
+            padding_tensor_vec.push_back(slice_layer->getOutput(0));
+        }
+
+        for (size_t i = 0; i < padding_tensor_vec.size() / 2; i++) {
+            int64_t axis = inRank - (i + 1); // axis = {inRank - 1, inRank - 2}
+            int64_t padding_index = i * 2;
+
+            nvinfer1::ITensor* in_flip = flip_nvtensor(engine, node, in, input0_is_dynamic, axis);
+
+            std::vector<nvinfer1::ITensor*> itensor_vec = {padding_tensor_vec[padding_index], padding_tensor_vec[padding_index + 1]};
+            std::vector<int32_t> tmp_vec;
+            auto left_start_size_right_size = gen_slice_start_size(engine, node, in, true, itensor_vec, tmp_vec, axis);
+
+            // left
+            tensors_vec.clear();
+            std::vector<int64_t> start_vec, size_vec, stride_vec(inRank, 1);
+
+            auto slice_layer = engine->network()->addSlice(*in_flip,
+                                    sizes_to_nvdim(stride_vec),
+                                    sizes_to_nvdim(stride_vec),
+                                    sizes_to_nvdim(stride_vec));
+
+            slice_layer->setInput(1, *(std::get<0>(left_start_size_right_size)));
+            slice_layer->setInput(2, *(std::get<1>(left_start_size_right_size)));
+            slice_layer->setName((layer_info(node) + "_ISliceLayer_for_leftpadding_" + std::to_string(axis)).c_str());
+
+            tensors_vec.push_back(slice_layer->getOutput(0));
+            tensors_vec.push_back(in);
+
+            auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+            concat_layer->setAxis(axis);
+            in = concat_layer->getOutput(0);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_leftpadding_" + std::to_string(axis)).c_str());
+            inDims = in->getDimensions();
+
+            // right
+            tensors_vec.clear();
+            tensors_vec.push_back(in);
+
+            std::vector<int64_t> start_vec2, stride_vec2;
+            for (int32_t r = 0; r < inRank; r++) {
+                start_vec2.push_back(0);
+                stride_vec2.push_back(1);
+            }
+            start_vec2[axis] = 1;
+
+            auto slice_layer2 = engine->network()->addSlice(*in_flip,
+                                    sizes_to_nvdim(start_vec2),
+                                    sizes_to_nvdim(stride_vec2),
+                                    sizes_to_nvdim(stride_vec2));
+
+            slice_layer2->setInput(2, *(std::get<2>(left_start_size_right_size)));
+            slice_layer2->setName((layer_info(node) + "_ISliceLayer_for_rightpadding_" + std::to_string(axis)).c_str());
+            tensors_vec.push_back(slice_layer2->getOutput(0));
+
+            auto concat_layer2 = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+            concat_layer2->setAxis(axis);
+            concat_layer2->setName((layer_info(node) + "_IConcatenationLayer_for_rightpadding_" + std::to_string(axis)).c_str());
+            in = concat_layer2->getOutput(0);
+            inDims = in->getDimensions();
+        }
+    }
+
+    engine->context().set_tensor(node->outputs()[0], in);
+    LOG(INFO) << "Output tensor shape: " << in->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ReflectionPadConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/reflection_pad.h b/poros/src/poros/converter/gpu/reflection_pad.h
new file mode 100644
index 0000000000..d49133025f
--- /dev/null
+++ b/poros/src/poros/converter/gpu/reflection_pad.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file reflection_pad.h
+* @author tianshaoqing@baidu.com
+* @date Tue Aug 16 16:54:20 CST 2022
+* @brief
+**/
+
+#pragma once
+
+#include <vector>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ReflectionPadConverter : public GpuConverter {
+public:
+    ReflectionPadConverter() {}
+    virtual ~ReflectionPadConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor",
+                "aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor",
+        };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::reflection_pad1d,
+                torch::jit::aten::reflection_pad2d,
+        };
+    }
+
+    bool assign_schema_attr() {
+        bool result = true;
+        result &= assign_schema_attr_helper({{"aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor", {1, 1}}});
+        result &= assign_schema_attr_helper({{"aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor", {1, 1}}});
+        return result;
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
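[Editor's note: a hedged reference sketch, not part of the patch. It reproduces the worked example from the comment in reflection_pad.cpp using ATen as the oracle; a float input is used since reflection padding is defined for floating-point tensors.]

#include <torch/script.h>
#include <iostream>

int main() {
    auto x = torch::arange(8, torch::kFloat).reshape({2, 4});
    // Matches the example in the converter's comment:
    // [[0,1,2,3],[4,5,6,7]] -> [[1,0,1,2,3,2,1],[5,4,5,6,7,6,5]]
    auto y = at::reflection_pad1d(x, {1, 2});
    std::cout << y << std::endl;
    return 0;
}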
diff --git a/poros/src/poros/converter/gpu/replication_pad.cpp b/poros/src/poros/converter/gpu/replication_pad.cpp
new file mode 100644
index 0000000000..f3ec4e2c60
--- /dev/null
+++ b/poros/src/poros/converter/gpu/replication_pad.cpp
@@ -0,0 +1,144 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Part of the following code in this file refs to
+// https://github.com/pytorch/TensorRT/blob/master/core/conversion/converters/impl/replication_pad.cpp
+//
+// Copyright (c) 2020-present, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// Licensed under the 3-Clause BSD License
+
+/**
+* @file replication_pad.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 14:29:20 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/replication_pad.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::replication_pad1d(Tensor self, int[2] padding) -> Tensor",
+"aten::replication_pad2d(Tensor self, int[4] padding) -> Tensor",
+"aten::replication_pad3d(Tensor self, int[6] padding) -> Tensor",
+*/
+bool ReplicationPadConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for ReplicationPadConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ReplicationPadConverter is not Tensor as expected");
+
+    //extract self
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto inDims = in->getDimensions();
+    int64_t inRank = inDims.nbDims;
+
+    //extract padding
+    auto padding = (engine->context().get_constant(inputs[1])).toIntList().vec();
+    if (padding.size() == 1) {
+        POROS_THROW_ERROR("Only 3D, 4D, 5D padding with non-constant padding are supported for now");
+    }
+    if (inRank == 3) {
+        POROS_CHECK(padding.size() == 2, "3D tensors expect 2 values for padding");
+    } else if (inRank == 4) {
+        POROS_CHECK(padding.size() == 4, "4D tensors expect 4 values for padding");
+    } else if (inRank == 5) {
+        POROS_CHECK(padding.size() == 6, "5D tensors expect 6 values for padding");
+    } else {
+        POROS_THROW_ERROR("Only 3D, 4D, 5D padding with non-constant padding are supported for now");
+    }
+
+    std::vector<nvinfer1::ITensor*> tensors_vec;
+    // input: (N, C, D_in, H_in, W_in).
+    // padding: (padding_left, padding_right, padding_top, padding_bottom, padding_front, padding_back)
+    // When axis is inRank - 1, making W_out = W_in + padding_left + padding_right.
+    // When axis is inRank - 2, making H_out = H_in + padding_top + padding_bottom.
+    // When axis is inRank - 3, making D_out = D_in + padding_front + padding_back.
+    for (int64_t i = 0; i < int(padding.size() / 2); i++) {
+        int64_t axis = inRank - (i + 1); // axis = {inRank - 1, inRank - 2, inRank - 3}
+        int64_t padding_index = i * 2;
+
+        if (padding[padding_index] > 0) { // left/top/front padding value
+            tensors_vec.clear();
+            at::Tensor left_indices = torch::tensor({0}, torch::kInt32);
+            auto indicesTensor = tensor_to_const(engine, left_indices);
+            auto left_gather_layer = engine->network()->addGather(*in, *indicesTensor, axis);
+            left_gather_layer->setName((layer_info(node) + "_IGatherLayer_for_left_axis_" + std::to_string(axis)).c_str());
+            auto left_gather_out = left_gather_layer->getOutput(0);
+            for (int i = 0; i < padding[padding_index]; i++) {
+                tensors_vec.push_back(left_gather_out);
+            }
+            tensors_vec.push_back(in);
+            auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+            concat_layer->setAxis(axis);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_left_axis_" + std::to_string(axis)).c_str());
+            in = concat_layer->getOutput(0);
+            inDims = in->getDimensions();
+        }
+
+        if (padding[padding_index + 1] > 0) { // right/bottom/back padding value
+            tensors_vec.clear();
+            tensors_vec.push_back(in);
+
+            nvinfer1::ITensor* indicesTensor = NULL;
+            if (inDims.d[axis] == -1) {
+                auto shapeTensor = engine->network()->addShape(*in)->getOutput(0);
+                at::Tensor dimValue = torch::tensor({axis}, torch::kInt32);
+                auto dimTensor = tensor_to_const(engine, dimValue);
+                indicesTensor = engine->network()->addGather(*shapeTensor, *dimTensor, 0)->getOutput(0);
+                auto oneTensor = tensor_to_const(engine, torch::tensor({1}, torch::kInt32));
+                indicesTensor = engine->network()->addElementWise(*indicesTensor,
+                    *oneTensor, nvinfer1::ElementWiseOperation::kSUB)->getOutput(0);
+            } else {
+                auto indices = torch::tensor({inDims.d[axis] - 1}, torch::kInt32);
+                indicesTensor = tensor_to_const(engine, indices);
+            }
+            auto right_gather_layer = engine->network()->addGather(*in, *indicesTensor, axis);
+            right_gather_layer->setName((layer_info(node) + "_IGatherLayer_for_right_axis_" + std::to_string(axis)).c_str());
+            auto right_gather_out = right_gather_layer->getOutput(0);
+
+            for (int i = 0; i < padding[padding_index + 1]; i++) {
+                tensors_vec.push_back(right_gather_out);
+            }
+
+            auto concat_layer = engine->network()->addConcatenation(tensors_vec.data(), tensors_vec.size());
+            concat_layer->setAxis(axis);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_right_axis_" + std::to_string(axis)).c_str());
+            in = concat_layer->getOutput(0);
+            inDims = in->getDimensions();
+        }
+    }
+
+    engine->context().set_tensor(node->outputs()[0], in);
+    LOG(INFO) << "Output tensor shape: " << in->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ReplicationPadConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
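[Editor's note: a hedged reference sketch, not part of the patch. A quick ATen check of the gather-and-concat construction above; values are illustrative.]

#include <torch/script.h>
#include <iostream>

int main() {
    // replication_pad1d repeats the edge values, so [1,2,3] with padding {2,1}
    // becomes [1,1,1,2,3,3]; the converter builds this by gathering index 0
    // (left edge) twice and index size-1 (right edge) once, then concatenating.
    auto x = torch::tensor({{1.0, 2.0, 3.0}});
    auto y = at::replication_pad1d(x, {2, 1});
    std::cout << y << std::endl;  // expected: 1 1 1 2 3 3
    return 0;
}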
diff --git a/poros/src/poros/converter/gpu/replication_pad.h b/poros/src/poros/converter/gpu/replication_pad.h
new file mode 100644
index 0000000000..1f9fe8bc43
--- /dev/null
+++ b/poros/src/poros/converter/gpu/replication_pad.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file replication_pad.h
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 14:29:20 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <vector>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class ReplicationPadConverter : public GpuConverter {
+public:
+    ReplicationPadConverter() {}
+    virtual ~ReplicationPadConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::replication_pad1d(Tensor self, int[2] padding) -> Tensor",
+                "aten::replication_pad2d(Tensor self, int[4] padding) -> Tensor",
+                "aten::replication_pad3d(Tensor self, int[6] padding) -> Tensor",
+        };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::replication_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::replication_pad2d.out(Tensor self, int[4] padding, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::replication_pad1d,
+                torch::jit::aten::replication_pad2d,
+                torch::jit::aten::replication_pad3d,
+        };
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
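[Editor's note: a hedged reference sketch, not part of the patch, for the shift normalization in roll.cpp below; values are illustrative.]

#include <torch/script.h>
#include <iostream>

int main() {
    auto x = torch::tensor({1, 2, 3, 4, 5});
    // A shift of 7 on a dim of size 5 is equivalent to 7 % 5 = 2, which is
    // exactly the remainder the converter takes before slicing.
    std::cout << torch::roll(x, {7}, {0}) << std::endl;   // 4 5 1 2 3
    // A negative shift rolls the other way; the converter handles it by
    // slicing off the first |shift| elements instead of the last ones.
    std::cout << torch::roll(x, {-1}, {0}) << std::endl;  // 2 3 4 5 1
    return 0;
}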
+
+/**
+* @file roll.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Jul 20 16:34:51 CST 2022
+* @brief
+**/
+
+#include "poros/converter/gpu/roll.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor)
+bool RollConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for RollConverter");
+
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for RollConverter is not Tensor as expected");
+
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::ListType::ofInts())
+        && inputs[2]->type()->isSubtypeOf(c10::ListType::ofInts())),
+        "input[1] or input[2] for RollConverter is not int[] as expected");
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    // extract shifts
+    std::vector<int64_t> shifts_vec = (engine->context().get_constant(inputs[1])).toIntList().vec();
+    // extract dims
+    std::vector<int64_t> dims_vec = (engine->context().get_constant(inputs[2])).toIntList().vec();
+
+    POROS_CHECK_TRUE((shifts_vec.size() == dims_vec.size()),
+        "The length of shifts and dims must be equal in RollConverter.");
+
+    // Implementation of aten::roll
+    // example:
+    // input = {1, 2, 3, 4, 5}; shifts = 3; dim = 0;
+    // Then slice input into two parts: {1, 2} and {3, 4, 5}.
+    // Finally flip their order and concat them on rolling dim 0: {3, 4, 5, 1, 2}.
+    // The same procedure is repeated for each additional rolling dimension.
+    nvinfer1::Dims self_dims = self->getDimensions();
+    for (size_t i = 0; i < shifts_vec.size(); i++) {
+        std::vector<nvinfer1::ITensor*> tensorlist;
+        int64_t rolling_dim = dims_vec[i];
+        rolling_dim = (rolling_dim < 0) ? (self_dims.nbDims + rolling_dim) : rolling_dim;
+
+        int64_t shift_stride = shifts_vec[i];
+        // Shift is allowed to be greater than the rolling dimension, so we need to take the remainder.
+        shift_stride = shift_stride % self_dims.d[rolling_dim];
+        // when shift == 0, no processing is required
+        if (shift_stride == 0) {
+            continue;
+        }
+        std::vector<int64_t> start_vec(self_dims.nbDims, 0);
+        std::vector<int64_t> size_vec(self_dims.nbDims, 0);
+        std::vector<int64_t> stride_vec(self_dims.nbDims, 1);
+
+        for (int32_t s = 0; s < self_dims.nbDims; s++) {
+            size_vec[s] = self_dims.d[s];
+        }
+
+        size_vec[rolling_dim] = (shift_stride < 0) ? (-shift_stride) : (self_dims.d[rolling_dim] - shift_stride);
+
+        auto slice_left_layer = engine->network()->addSlice(*self,
+                                        sizes_to_nvdim(start_vec),
+                                        sizes_to_nvdim(size_vec),
+                                        sizes_to_nvdim(stride_vec));
+        slice_left_layer->setName((layer_info(node) + "_left_slice_" + std::to_string(i)).c_str());
+        nvinfer1::ITensor* left_slice = slice_left_layer->getOutput(0);
+
+        start_vec[rolling_dim] = size_vec[rolling_dim];
+        size_vec[rolling_dim] = self_dims.d[rolling_dim] - size_vec[rolling_dim];
+
+        auto slice_right_layer = engine->network()->addSlice(*self,
+                                        sizes_to_nvdim(start_vec),
+                                        sizes_to_nvdim(size_vec),
+                                        sizes_to_nvdim(stride_vec));
+        slice_right_layer->setName((layer_info(node) + "_right_slice_" + std::to_string(i)).c_str());
+        nvinfer1::ITensor* right_slice = slice_right_layer->getOutput(0);
+        tensorlist.push_back(right_slice);
+        tensorlist.push_back(left_slice);
+
+        auto cat_layer = engine->network()->addConcatenation(tensorlist.data(), tensorlist.size());
+        cat_layer->setAxis(static_cast<int32_t>(rolling_dim));
+        cat_layer->setName((layer_info(node) + "_cat_" + std::to_string(i)).c_str());
+        self = cat_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], self);
+    LOG(INFO) << "Output shape: " << self->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, RollConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/roll.h b/poros/src/poros/converter/gpu/roll.h
new file mode 100644
index 0000000000..a55da88eb9
--- /dev/null
+++ b/poros/src/poros/converter/gpu/roll.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file roll.h
+* @author tianshaoqing@baidu.com
+* @date Wed Jul 20 16:33:51 CST 2022
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class RollConverter : public GpuConverter {
+public:
+    RollConverter() {}
+    virtual ~RollConverter() {}
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+    const std::vector<std::string> schema_string() {
+        return {"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor"};
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        // return {torch::jit::aten::roll}; // can't find the definition in torch-1.9.0
+        return {c10::Symbol::fromQualString("aten::roll")};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor", {0, 0}}});
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
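Editorial note: the slice-and-concat decomposition used by RollConverter can be checked against a plain C++ sketch over a 1-D array (standalone; sizes and values are illustrative). The key detail is the split point: size - shift for a positive shift, -shift for a negative one, after taking the remainder:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Roll a 1-D vector by `shift` using the converter's scheme: take the remainder,
    // split at size - shift (positive shift) or -shift (negative), then swap the halves.
    std::vector<int> roll_1d(const std::vector<int>& in, int64_t shift) {
        const int64_t size = static_cast<int64_t>(in.size());
        shift = shift % size;
        if (shift == 0) return in;
        const int64_t left_len = (shift < 0) ? -shift : size - shift;
        std::vector<int> out(in.begin() + left_len, in.end());    // the "right" slice first
        out.insert(out.end(), in.begin(), in.begin() + left_len); // then the "left" slice
        return out;
    }

    int main() {
        for (int v : roll_1d({1, 2, 3, 4, 5}, 3)) std::cout << v << ' '; // 3 4 5 1 2
        std::cout << '\n';
        return 0;
    }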
diff --git a/poros/src/poros/converter/gpu/select.cpp b/poros/src/poros/converter/gpu/select.cpp
new file mode 100644
index 0000000000..e9d5778fc4
--- /dev/null
+++ b/poros/src/poros/converter/gpu/select.cpp
@@ -0,0 +1,1318 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file select.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Aug 24 16:31:28 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/select.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a)*/
+bool SelectConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for SelectConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for SelectConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for SelectConverter does not come from prim::Constant as expected");
+    // POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+    //     "input[2] for SelectConverter does not come from prim::Constant as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto maxDim = static_cast<int64_t>(in->getDimensions().nbDims);
+
+    //extract dim
+    auto dim = (engine->context().get_constant(inputs[1])).toInt();
+    dim = dim < 0 ? dim + maxDim : dim;
+
+    nvinfer1::ITensor* index_tensor = engine->context().get_tensor(inputs[2]);
+    //extract index
+    if (index_tensor == nullptr) {
+        auto ind = (int32_t)((engine->context().get_constant(inputs[2])).toInt());
+        // in the dynamic case this dim is -1 at build time, and a negative index
+        // must be rebased to a positive one against the runtime dim size
+        if (in->getDimensions().d[dim] < 0 && ind < 0) {
+            nvinfer1::ITensor* in_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+            std::vector<int64_t> start_vec = {dim}, size_vec = {1}, stride_vec = {1};
+            nvinfer1::ISliceLayer* slice_layer = engine->network()->addSlice(*in_shape_tensor,
+                                                        sizes_to_nvdim(start_vec),
+                                                        sizes_to_nvdim(size_vec),
+                                                        sizes_to_nvdim(stride_vec));
+            nvinfer1::ITensor* in_dim_val = slice_layer->getOutput(0);
+            nvinfer1::ITensor* ind_tensor = tensor_to_const(engine, torch::tensor({ind}).to(torch::kI32));
+            index_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUM,
+                                in_dim_val,
+                                ind_tensor,
+                                layer_info(node) + std::string("_neg_index_to_pos"))->getOutput(0);
+
+        } else {
+            ind = ind < 0 ? ind + in->getDimensions().d[dim] : ind;
+            // index to access needs to be an at::Tensor
+            at::Tensor indices = torch::tensor({ind}).to(torch::kI32);
+            index_tensor = tensor_to_const(engine, indices);
+        }
+    } else {
+        POROS_CHECK_TRUE((in->getDimensions().d[dim] >= 0), "When index(input[2]) of aten::select is not from prim::Constant,"
+            " the selected " + std::to_string(dim) + "th dim of input must be fixed (not dynamic)." << node_info(node));
+    }
+
+    // IGatherLayer takes in input tensor, the indices, and the axis
+    // of input tensor to take indices from
+    auto gather_layer = engine->network()->addGather(*in, *index_tensor, dim);
+    POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node);
+    gather_layer->setName((layer_info(node) + "_gather").c_str());
+    auto out = gather_layer->getOutput(0);
+    LOG(INFO) << "Gather tensor shape: " << out->getDimensions();
+
+    if (out->getDimensions().nbDims != 1) {
+        // IShuffleLayer removes redundant dimensions
+        auto shuffle_layer = engine->network()->addShuffle(*out);
+        POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+        // when input is dynamic
+        if (check_nvtensor_is_dynamic(out)) {
+            nvinfer1::ITensor* gather_out_shape_tensor = engine->network()->addShape(*out)->getOutput(0);
+            gather_out_shape_tensor = squeeze_nv_shapetensor(engine, gather_out_shape_tensor, dim);
+            shuffle_layer->setInput(1, *gather_out_shape_tensor);
+        } else {
+            // when input is not dynamic
+            shuffle_layer->setReshapeDimensions(squeeze_dims(out->getDimensions(), dim, false));
+        }
+        shuffle_layer->setName(layer_info(node).c_str());
+        out = shuffle_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], out);
+    LOG(INFO) << "Output tensor shape: " << out->getDimensions();
+    return true;
+}
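Editorial note: the static branch of SelectConverter rebases a negative index by the known dim size, while the dynamic branch builds the same `dim_size + ind` expression out of Shape, Slice and ElementWise layers because the size is only known at runtime. The static rule, as a standalone sketch:

    #include <cassert>
    #include <cstdint>

    // Rebase a (possibly negative) aten::select index into [0, dim_size),
    // as the static branch of SelectConverter does.
    int64_t normalize_index(int64_t ind, int64_t dim_size) {
        return ind < 0 ? ind + dim_size : ind;
    }

    int main() {
        assert(normalize_index(-1, 5) == 4);
        assert(normalize_index(2, 5) == 2);
        return 0;
    }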
+
+// aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+bool SliceConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+
+    if (node->schema().operator_name() == torch::jit::parseSchema(this->schema_string()[1]).operator_name()) {
+        // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])
+        POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for SliceConverter");
+
+        nvinfer1::ITensor* self_nvtensor = nullptr;
+        std::vector<int64_t> self_vec = {};
+        int32_t dim_rank = 0;
+        std::vector<nvinfer1::ITensor*> itensor_vec = {};
+        bool has_tensor_scalar = false;
+        // input[0] is int[]
+        if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+            if (has_tensor_scalar) {
+                self_nvtensor = this->get_tensor_scalar(inputs[0]);
+                POROS_CHECK_TRUE((self_nvtensor != nullptr), node_info(node) + std::string("get int nvtensor false."));
+                dim_rank = (self_nvtensor->getDimensions()).d[0];
+            } else {
+                self_vec = (engine->context().get_constant(inputs[0])).toIntList().vec();
+                dim_rank = self_vec.size();
+            }
+        // tensor[]
+        } else if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+            POROS_CHECK_TRUE(engine->context().get_tensorlist(inputs[0], itensor_vec), "extract tensor list error.");
+            dim_rank = itensor_vec.size();
+        } else {
+            LOG(WARNING) << node->schema().operator_name() << " converter input[0] meets unsupported type.";
+            return false;
+        }
+        // extract start, end and step
+        torch::jit::IValue maybe_start = engine->context().get_constant(inputs[1]);
+        int64_t startIdx = maybe_start.isNone() ? 0 : maybe_start.toInt();
+        startIdx = (startIdx < 0) ? (dim_rank + startIdx) : startIdx;
+
+        torch::jit::IValue maybe_end = engine->context().get_constant(inputs[2]);
+        int64_t endIdx = maybe_end.isNone() ? dim_rank : maybe_end.toInt();
+        endIdx = (endIdx < 0) ? (dim_rank + endIdx) : endIdx;
+
+        int64_t step = (engine->context().get_constant(inputs[3])).toInt();
+
+        POROS_CHECK_TRUE((startIdx <= endIdx && endIdx <= dim_rank),
+            node_info(node) + std::string("start > end or end > self_size"));
+        // input[0] is int[]
+        if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            if (has_tensor_scalar) {
+                int64_t size = ceil(float(endIdx - startIdx) / float(step));
+                std::vector<int64_t> start_vec{startIdx}, size_vec{size}, stride_vec{step};
+                auto slice_layer = engine->network()->addSlice(*self_nvtensor,
+                                                    sizes_to_nvdim(start_vec),
+                                                    sizes_to_nvdim(size_vec),
+                                                    sizes_to_nvdim(stride_vec));
+                POROS_CHECK(slice_layer, "Unable to get given dim info from node: " << *node);
+                slice_layer->setName(layer_info(node).c_str());
+                nvinfer1::ITensor* slice_output = slice_layer->getOutput(0);
+                engine->context().set_tensor(node->outputs()[0], slice_output);
+            } else {
+                c10::List<int64_t> list;
+                int index = startIdx;
+                while (index <= endIdx - 1) {
+                    list.push_back(self_vec[index]);
+                    index += step;
+                }
+                auto output_ivalue = c10::optional<torch::jit::IValue>(std::move(torch::jit::IValue(list)));
+                engine->context().set_constant(node->outputs()[0], output_ivalue);
+            }
+        } else if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+            std::vector<nvinfer1::ITensor*> output_itensor_vec = {};
+            int index = startIdx;
+            while (index <= endIdx - 1) {
+                output_itensor_vec.push_back(itensor_vec[index]);
+                index += step;
+            }
+            engine->context().set_tensorlist(node->outputs()[0], output_itensor_vec);
+        } else {
+            LOG(WARNING) << node->schema().operator_name() << " converter input[0] meets unsupported type.";
+            return false;
+        }
+
+        return true;
+    }
+
+    POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for SliceConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for SliceConverter is not Tensor as expected");
+    for (int32_t i = 1; i < 5; i++) {
+        if (i == 2 || i == 3) {
+            continue;
+        }
+        POROS_CHECK_TRUE((inputs[i]->node()->kind() == torch::jit::prim::Constant),
+            std::string("input[") + std::to_string(i) + std::string("] for SliceConverter does not come from prim::Constant as expected"));
+    }
+
+    nvinfer1::ITensor* in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    int64_t dim = (engine->context().get_constant(inputs[1])).toInt();
+    nvinfer1::Dims in_dims = in->getDimensions();
+    int64_t axis = c10::maybe_wrap_dim(dim, in_dims.nbDims);
+
+    torch::jit::IValue maybe_start = engine->context().get_constant(inputs[2]);
+    int64_t startIdx = maybe_start.isNone() ? 0 : maybe_start.toInt();
+    torch::jit::IValue maybe_end = engine->context().get_constant(inputs[3]);
+    int64_t endIdx = maybe_end.isNone() ? INT64_MAX : maybe_end.toInt();
+    int64_t step = (engine->context().get_constant(inputs[4])).toInt();
+    POROS_CHECK_TRUE((step > 0), "step for SliceConverter must be positive");
+
+    int64_t maxDim = static_cast<int64_t>(in_dims.d[axis]);
+    int64_t start = 0, end = INT64_MAX;
+
+    // not dynamic, or the axis dim is not negative:
+    // make sure start and end are positive
+    if (maxDim >= 0) {
+        //extract start
+        start = (startIdx < 0) ? (maxDim + startIdx) : startIdx;
+        POROS_CHECK_TRUE((start >= 0 && start <= maxDim), "invalid start for SliceConverter");
+        //extract end
+        endIdx = std::min(endIdx, maxDim);
+        end = (endIdx < 0) ? (maxDim + endIdx) : endIdx;
+        POROS_CHECK_TRUE((end >= start && end <= maxDim), "invalid end for SliceConverter or end less than start");
+        POROS_CHECK_TRUE((step <= maxDim), "invalid step for SliceConverter");
+    }
+
+    std::vector<int64_t> start_vec, size_vec, stride_vec;
+    bool is_dynamic = check_nvtensor_is_dynamic(in);
+    bool has_tensor_scalar = check_inputs_tensor_scalar(engine, node);
+    for (int32_t i = 0; i < in_dims.nbDims; i++) {
+        start_vec.push_back(0);
+        size_vec.push_back(in_dims.d[i]);
+        stride_vec.push_back(1);
+    }
+    stride_vec[axis] = step;
+    start_vec[axis] = start;
+
+    nvinfer1::ILayer* slice_layer = nullptr;
+
+    // not dynamic, and the ints don't have nvtensor inputs.
+    if (!is_dynamic && !has_tensor_scalar) {
+        int64_t size = ceil(float(end - start) / float(step));
+        size_vec[axis] = size;
+        slice_layer = engine->network()->addSlice(*in,
+                                    sizes_to_nvdim(start_vec),
+                                    sizes_to_nvdim(size_vec),
+                                    sizes_to_nvdim(stride_vec));
+        slice_layer->setName(layer_info(node).c_str());
+    } else { // dynamic
+        nvinfer1::IShapeLayer* shape_layer = engine->network()->addShape(*in);
+        nvinfer1::ITensor* in_shape_tensor = shape_layer->getOutput(0);
+        nvinfer1::ITensor* start_tensor = nullptr, *size_tensor = nullptr, *end_tensor = nullptr;
+
+        std::vector<int32_t> dy_mask_vec, dy_rev_mask_vec;
+
+        for (int32_t i = 0; i < in_dims.nbDims; i++) {
+            dy_mask_vec.push_back(0);
+            dy_rev_mask_vec.push_back(1);
+        }
+
+        // Prepare for the following calculations.
+        // E.g. given dynamic input dims [4, 5, *, 7] (runtime input dims [4, 5, 6, 7]) and axis dim 2:
+        // mask_tensor is [0, 0, 1, 0], rev_mask_tensor is [1, 1, 0, 1],
+        // mask_shape_tensor is [0, 0, 6, 0], rev_mask_shape_tensor is [4, 5, 0, 7].
+        at::Tensor mask_tensor = torch::tensor(dy_mask_vec, torch::kInt);
+        at::Tensor rev_mask_tensor = torch::tensor(dy_rev_mask_vec, torch::kInt);
+
+        rev_mask_tensor[axis] = 0;
+        nvinfer1::ITensor* const_rev_mask_tensor = tensor_to_const(engine, rev_mask_tensor);
+        nvinfer1::ITensor* rev_mask_shape_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                in_shape_tensor,
+                                const_rev_mask_tensor,
+                                layer_info(node) + std::string("_axis_dim_to_zero"))->getOutput(0);
+        mask_tensor[axis] = 1;
+        nvinfer1::ITensor* const_mask_tensor = tensor_to_const(engine, mask_tensor);
+        nvinfer1::ITensor* mask_shape_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                in_shape_tensor,
+                                const_mask_tensor,
+                                layer_info(node) + std::string("_other_dims_to_zero"))->getOutput(0);
+        // has_tensor_scalar was already computed above; no need to query it again
+        if (has_tensor_scalar) {
+            // Generally, only start and end come from nvtensor
+            // nvinfer1::ITensor* dim_int_nvtensor = this->get_tensor_scalar(inputs[1]);
+            nvinfer1::ITensor* start_int_nvtensor = this->get_tensor_scalar(inputs[2]);
+            nvinfer1::ITensor* end_int_nvtensor = this->get_tensor_scalar(inputs[3]);
+            // nvinfer1::ITensor* stride_int_nvtensor = this->get_tensor_scalar(inputs[4]);
+
+            // only end from nvtensor (start is none)
+            if (end_int_nvtensor != nullptr && start_int_nvtensor == nullptr) {
+                LOG(INFO) << "Slice only end from nvtensor";
+                mask_tensor[axis] = 0;
+                start_tensor = tensor_to_const(engine, mask_tensor);
+                nvinfer1::ITensor* end_tensor_temp = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                const_mask_tensor,
+                                end_int_nvtensor,
+                                layer_info(node) + std::string("_end_prod_mask_shape_tensor"))->getOutput(0);
+                end_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUM,
+                                end_tensor_temp,
+                                rev_mask_shape_tensor,
+                                layer_info(node) + std::string("_end_tmp_sum_rev_mask_shape_tensor"))->getOutput(0);
+            // only start from nvtensor (end is none)
+            } else if (end_int_nvtensor == nullptr && start_int_nvtensor != nullptr) {
+                LOG(INFO) << "Slice only start from nvtensor";
+                start_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                const_mask_tensor,
+                                start_int_nvtensor,
+                                layer_info(node) + std::string("_start_prod_mask_shape_tensor"))->getOutput(0);
+                end_tensor = in_shape_tensor;
+            // start and end both from nvtensor
+            } else {
+                LOG(INFO) << "Slice start and end both from nvtensor";
+                // make sure that whichever of start or end does not come from an nvtensor is positive when maxDim >= 0
+                if (maxDim >= 0) {
+                    if (!maybe_start.isNone()) {
+                        LOG(INFO) << "Slice start can be from constant";
+                        start_int_nvtensor = tensor_to_const(engine, torch::tensor({start}, torch::kInt));
+                    }
+                    if (!maybe_end.isNone()) {
+                        LOG(INFO) << "Slice end can be from constant";
+                        end_int_nvtensor = tensor_to_const(engine, torch::tensor({end}, torch::kInt));
+                    }
+                }
+                start_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                const_mask_tensor,
+                                start_int_nvtensor,
+                                layer_info(node) + std::string("_start_prod_mask_shape_tensor"))->getOutput(0);
+                nvinfer1::ITensor* end_tensor_temp = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kPROD,
+                                const_mask_tensor,
+                                end_int_nvtensor,
+                                layer_info(node) + std::string("_end_prod_mask_shape_tensor"))->getOutput(0);
+                end_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUM,
+                                end_tensor_temp,
+                                rev_mask_shape_tensor,
+                                layer_info(node) + std::string("_end_tmp_sum_rev_mask_shape_tensor"))->getOutput(0);
+            }
+            nvinfer1::ITensor* sub_tensor = add_elementwise(engine,
+                            nvinfer1::ElementWiseOperation::kSUB,
+                            end_tensor,
+                            start_tensor,
+                            layer_info(node) + std::string("_end_sub_start"))->getOutput(0);
+            // Equivalent to ceil((end - start) / step) -> size
+            if (step > 1) {
+                mask_tensor[axis] = step - 1;
+                nvinfer1::ITensor* sum_step_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUM,
+                                sub_tensor,
+                                tensor_to_const(engine, mask_tensor),
+                                layer_info(node) + std::string("_sum_step_sub_one"))->getOutput(0);
+                rev_mask_tensor[axis] = step;
+                size_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kFLOOR_DIV,
+                                sum_step_tensor,
+                                tensor_to_const(engine, rev_mask_tensor),
+                                layer_info(node) + std::string("_div_get_size"))->getOutput(0);
+            } else {
+                size_tensor = sub_tensor;
+            }
+
+        } else {
+            if (maxDim < 0) {
+                // start
+                mask_tensor[axis] = startIdx;
+                if (startIdx < 0) {
+                    start_tensor = add_elementwise(engine,
+                                    nvinfer1::ElementWiseOperation::kSUM,
+                                    mask_shape_tensor,
+                                    tensor_to_const(engine, mask_tensor),
+                                    layer_info(node) + std::string("_start_tensor"))->getOutput(0);
+                } else {
+                    start_tensor = tensor_to_const(engine, mask_tensor);
+                }
+                // end
+                if (maybe_end.isNone()) {
+                    end_tensor = in_shape_tensor;
+                } else {
+                    mask_tensor[axis] = endIdx;
+                    if (endIdx < 0) {
+                        end_tensor = add_elementwise(engine,
+                                        nvinfer1::ElementWiseOperation::kSUM,
+                                        in_shape_tensor,
+                                        tensor_to_const(engine, mask_tensor),
+                                        layer_info(node) + std::string("_end_tensor_to_pos"))->getOutput(0);
+                    } else {
+                        end_tensor = add_elementwise(engine,
+                                        nvinfer1::ElementWiseOperation::kSUM,
+                                        rev_mask_shape_tensor,
+                                        tensor_to_const(engine, mask_tensor),
+                                        layer_info(node) + std::string("_end_tensor"))->getOutput(0);
+                    }
+                }
+                nvinfer1::ITensor* sub_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUB,
+                                end_tensor,
+                                start_tensor,
+                                layer_info(node) + std::string("_end_sub_start"))->getOutput(0);
+                // Equivalent to ceil((end - start) / step) -> size
+                if (step > 1) {
+                    mask_tensor[axis] = step - 1;
+                    nvinfer1::ITensor* sum_step_tensor = add_elementwise(engine,
+                                    nvinfer1::ElementWiseOperation::kSUM,
+                                    sub_tensor,
+                                    tensor_to_const(engine, mask_tensor),
+                                    layer_info(node) + std::string("_sum_step_sub_one"))->getOutput(0);
+                    rev_mask_tensor[axis] = step;
+                    size_tensor = add_elementwise(engine,
+                                    nvinfer1::ElementWiseOperation::kFLOOR_DIV,
+                                    sum_step_tensor,
+                                    tensor_to_const(engine, rev_mask_tensor),
+                                    layer_info(node) + std::string("_div_get_size"))->getOutput(0);
+                } else {
+                    size_tensor = sub_tensor;
+                }
+            } else {
+                mask_tensor[axis] = start;
+                start_tensor = tensor_to_const(engine, mask_tensor);
+
+                mask_tensor[axis] = ceil(float(end - start) / float(step));
+                size_tensor = add_elementwise(engine,
+                                nvinfer1::ElementWiseOperation::kSUM,
+                                rev_mask_shape_tensor,
+                                tensor_to_const(engine, mask_tensor),
+                                layer_info(node) + std::string("_sum_get_size"))->getOutput(0);
+            }
+        }
+
+        std::vector<int64_t> temp_vec = {0, 0};
+        slice_layer = engine->network()->addSlice(*in, sizes_to_nvdim(temp_vec),
+                                                        sizes_to_nvdim(temp_vec),
+                                                        sizes_to_nvdim(stride_vec));
+        slice_layer->setInput(0, *in);
+        slice_layer->setInput(1, *start_tensor);
+        slice_layer->setInput(2, *size_tensor);
+        // slice_layer->setInput(3, *stride_tensor);
+        slice_layer->setName(layer_info(node).c_str());
+    }
+
+    nvinfer1::ITensor* slice_out = slice_layer->getOutput(0);
+    engine->context().set_tensor(node->outputs()[0], slice_out);
+    LOG(INFO) << "Output tensor shape: " << slice_out->getDimensions();
+    return true;
+}
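Editorial note: both branches above compute the slice output size as ceil((end - start) / step); the dynamic branch has to express it with integer layers as (end - start + step - 1) / step using floor division. The two forms agree for any non-negative length and positive step, which a standalone check confirms:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
        for (int64_t len = 0; len < 100; len++) {
            for (int64_t step = 1; step < 10; step++) {
                int64_t by_float = static_cast<int64_t>(std::ceil(float(len) / float(step)));
                int64_t by_int = (len + step - 1) / step; // what the kFLOOR_DIV path computes
                assert(by_float == by_int);
            }
        }
        return 0;
    }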
+
+/*aten::embedding(Tensor weight,
+Tensor indices,
+int padding_idx=-1,
+bool scale_grad_by_freq=False,
+bool sparse=False) -> Tensor*/
+bool EmbeddingConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for EmbeddingConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for EmbeddingConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[1] for EmbeddingConverter is not Tensor as expected");
+
+    auto embedding = engine->context().get_tensor(inputs[0]);
+    auto indices = engine->context().get_tensor(inputs[1]);
+    POROS_CHECK_TRUE(((embedding != nullptr) && (indices != nullptr)),
+        "Unable to init input tensor for node: " << *node);
+
+    // Set datatype for indices tensor to INT32
+    auto identity = engine->network()->addIdentity(*indices);
+    identity->setOutputType(0, nvinfer1::DataType::kINT32);
+    identity->setName((layer_info(node) + "_identity").c_str());
+    indices = identity->getOutput(0);
+
+    // IGatherLayer takes in input tensor, the indices, and the axis of input tensor to take indices from
+    auto gather_layer = engine->network()->addGather(*embedding, *indices, 0);
+    POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node);
+    gather_layer->setName(layer_info(node).c_str());
+    auto gather_out = gather_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], gather_out);
+    LOG(INFO) << "Output tensor shape: " << gather_out->getDimensions();
+    return true;
+}
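Editorial note: aten::embedding reduces to a row gather on the weight matrix, which is why a single IGatherLayer along axis 0 (after casting the indices to INT32) suffices. A standalone sketch of the same lookup (table contents are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Gather rows of a (num_embeddings x dim) table, mirroring addGather(weight, indices, /*axis=*/0).
    int main() {
        const std::vector<std::vector<float>> table = {{0.f, 0.f}, {1.f, 1.f}, {2.f, 2.f}};
        const std::vector<int32_t> indices = {2, 0, 2};
        for (int32_t idx : indices) {
            std::cout << table[idx][0] << ',' << table[idx][1] << '\n'; // one embedding row per index
        }
        return 0;
    }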
+
+/*
+aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
+aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)
+*/
+bool NarrowConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for NarrowConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for NarrowConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    //extract dim & length
+    auto maxDim = static_cast<int64_t>(in->getDimensions().nbDims);
+    auto axis = (engine->context().get_constant(inputs[1])).toInt();
+    axis = (axis < 0) ? (axis + maxDim) : axis;
+    auto length = (int32_t)(engine->context().get_constant(inputs[3])).toInt();
+
+    //extract start
+    int32_t start = 0;
+    auto maybe_start = engine->context().get_constant(inputs[2]);
+    if (maybe_start.isInt()) {
+        start = (int32_t)maybe_start.toInt();
+        start = (start < 0) ? (maxDim + start) : start;
+    } else if (maybe_start.isTensor()) {
+        auto start_tensor = maybe_start.toTensor().to(torch::kI32);
+        start = start_tensor.item().to<int32_t>();
+    }
+
+    // index to access needs to be an at::Tensor
+    at::Tensor indices = torch::arange(start, start + length, 1).to(torch::kI32);
+    auto weights = Weights(indices);
+
+    // IConstantLayer to convert indices from Weights to ITensor
+    auto const_layer = engine->network()->addConstant(weights.shape, weights.data);
+    POROS_CHECK(const_layer, "Unable to create constant layer from node: " << *node);
+    auto const_out = const_layer->getOutput(0);
+
+    // IGatherLayer takes in input tensor, the indices, and the axis
+    // of input tensor to take indices from
+    auto gather_layer = engine->network()->addGather(*in, *const_out, axis);
+    POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node);
+    auto gather_out = gather_layer->getOutput(0);
+
+    // IShuffleLayer removes redundant dimensions
+    auto shuffle_layer = engine->network()->addShuffle(*gather_out);
+    POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+    shuffle_layer->setReshapeDimensions(unpad_nvdim(gather_out->getDimensions()));
+    shuffle_layer->setName(layer_info(node).c_str());
+    auto shuffle_out = shuffle_layer->getOutput(0);
+    engine->context().set_tensor(node->outputs()[0], shuffle_out);
+    LOG(INFO) << "Output tensor shape: " << shuffle_out->getDimensions();
+    return true;
+}
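Editorial note: aten::narrow(self, dim, start, length) selects the contiguous index range [start, start + length) along dim, and the converter materializes exactly that range with torch::arange before gathering. The index set it builds, as a standalone sketch:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // The indices NarrowConverter feeds to IGatherLayer for narrow(dim, start, length).
    std::vector<int32_t> narrow_indices(int32_t start, int32_t length) {
        std::vector<int32_t> out;
        for (int32_t i = start; i < start + length; i++) out.push_back(i);
        return out;
    }

    int main() {
        for (int32_t v : narrow_indices(1, 3)) std::cout << v << ' '; // 1 2 3
        std::cout << '\n';
        return 0;
    }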
+
+/*
+aten::split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
+aten::split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]
+aten::unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]
+*/
+bool SplitConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3 || inputs.size() == 2), "invalid inputs size for SplitConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for SplitConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    int axis = 0;
+    //extract dim
+    if (inputs.size() == 3) {
+        axis = (engine->context().get_constant(inputs[2])).toInt();
+    } else {
+        // node->kind() == torch::jit::aten::unbind
+        // aten::unbind is very similar to aten::split with split_size = 1, except that
+        // aten::unbind squeezes the split dim at the end.
+        // e.g. for input shape (2,3,4), dim=1, split_size=1:
+        // aten::unbind yields three (2,4) tensors, while aten::split yields three (2,1,4) tensors.
+        axis = (engine->context().get_constant(inputs[1])).toInt();
+    }
+
+    auto in_dim_size = in->getDimensions().d[axis];
+
+    //extract split_size
+    auto num_outputs = 1;
+    auto num_remainder = 0;
+    std::vector<int64_t> sizes;
+    auto maybe_split_size = engine->context().get_constant(inputs[1]);
+    if (node->kind() == torch::jit::aten::split_with_sizes) {
+        sizes = maybe_split_size.toIntList().vec();
+        num_outputs = sizes.size();
+    } else { // node->kind() == torch::jit::aten::split
+        auto split_size = maybe_split_size.toInt();
+        // set split_size to 1 when node->kind() == torch::jit::aten::unbind
+        if (inputs.size() == 2) {
+            split_size = 1;
+        }
+        num_outputs = in_dim_size / split_size;
+        num_remainder = in_dim_size % split_size;
+        for (int64_t i = 0; i < num_outputs; i++) {
+            sizes.push_back(split_size);
+        }
+        if (num_remainder) {
+            num_outputs += 1;
+            sizes.push_back(num_remainder);
+        }
+    }
+
+    LOG(INFO) << "Number of split outputs: " << num_outputs;
+
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    tensorlist.reserve(num_outputs);
+
+    int start_idx = 0;
+    for (int64_t i = 0; i < num_outputs; i++) {
+        at::Tensor indices = torch::arange(start_idx, start_idx + sizes[i], 1).to(torch::kI32);
+        auto indices_tensor = tensor_to_const(engine, indices);
+
+        auto gather_layer = engine->network()->addGather(*in, *indices_tensor, axis);
+        auto gather_out = gather_layer->getOutput(0);
+        // squeeze the axis dim once for aten::unbind
+        if (inputs.size() == 2) {
+            nvinfer1::IShuffleLayer* shuffle_l = engine->network()->addShuffle(*gather_out);
+            std::vector<int64_t> in_shape_vec = nvdim_to_sizes(in->getDimensions());
+            in_shape_vec.erase(in_shape_vec.begin() + axis);
+            shuffle_l->setReshapeDimensions(sizes_to_nvdim(in_shape_vec));
+            gather_out = shuffle_l->getOutput(0);
+        }
+
+        tensorlist.emplace_back(gather_out);
+        start_idx = start_idx + sizes[i];
+    }
+
+    engine->context().set_tensorlist(node->outputs()[0], tensorlist);
+    return true;
+}
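Editorial note: the size bookkeeping above is the only subtle part of aten::split: the dim is cut into in_dim_size / split_size full chunks plus one remainder chunk when the division is not exact. A standalone check of that arithmetic:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Chunk sizes produced by SplitConverter for aten::split(self, split_size, dim).
    std::vector<int64_t> split_sizes(int64_t in_dim_size, int64_t split_size) {
        std::vector<int64_t> sizes(in_dim_size / split_size, split_size);
        if (in_dim_size % split_size) sizes.push_back(in_dim_size % split_size);
        return sizes;
    }

    int main() {
        assert((split_sizes(10, 3) == std::vector<int64_t>{3, 3, 3, 1}));
        assert((split_sizes(9, 3) == std::vector<int64_t>{3, 3, 3}));
        return 0;
    }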
+
+/*
+aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
+*/
+bool MaskedFillConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for MaskedFillConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for MaskedFillConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[1] for MaskedFillConverter is not Tensor as expected");
+
+    //extract self & mask
+    auto self = engine->context().get_tensor(inputs[0]);
+    auto mask = engine->context().get_tensor(inputs[1]);
+    POROS_CHECK_TRUE((self != nullptr && mask != nullptr), "Unable to init input tensor for node: " << *node);
+    int max_rank = std::max({self->getDimensions().nbDims, mask->getDimensions().nbDims});
+
+    bool is_dynamic = check_nvtensor_is_dynamic(self) || check_nvtensor_is_dynamic(mask);
+    if (is_dynamic) {
+        self = broadcast_itensor(engine, node, self, max_rank, "self");
+        mask = broadcast_itensor(engine, node, mask, max_rank, "mask");
+    } else {
+        mask = add_padding(engine, node, mask, max_rank, false, true);
+        self = add_padding(engine, node, self, max_rank, false, true);
+    }
+
+    //extract value
+    nvinfer1::ITensor* val_t = engine->context().get_tensor(inputs[2]);
+    //situation1: val is a scalar and is_dynamic == false
+    if (val_t == nullptr && !is_dynamic) {
+        auto val = (engine->context().get_constant(inputs[2])).toScalar().to<float>();
+        val_t = tensor_to_const(engine, torch::full(nvdim_to_sizes(self->getDimensions()), val));
+    //situation2: val is a scalar and is_dynamic == true
+    } else if (val_t == nullptr && is_dynamic) {
+        //change scalar to tensor and broadcast it
+        auto val = (engine->context().get_constant(inputs[2])).toScalar().to<float>();
+        at::Tensor val_at_tensor = torch::tensor({val});
+        nvinfer1::ITensor* val_nv_tensor = tensor_to_const(engine, val_at_tensor);
+        val_t = broadcast_itensor(engine, node, val_nv_tensor, max_rank, "value");
+    //situation3: val is a tensor
+    } else {
+        int32_t value_rank = val_t->getDimensions().nbDims;
+        POROS_CHECK(value_rank == 0, "masked_fill only supports a 0-dimensional value tensor");
+        //let's expand value
+        int32_t new_value_rank = self->getDimensions().nbDims;
+        //nvinfer1::ITensor* new_value_shape = engine->network()->addShape(*self)->getOutput(0);
+
+        // first pad value's dims out to [1, 1, 1, ...], implemented with a shuffle
+        std::vector<int64_t> new_dim(new_value_rank, 1);
+        auto reshape_layer = engine->network()->addShuffle(*val_t);
+        reshape_layer->setReshapeDimensions(sizes_to_nvdim(c10::IntArrayRef(new_dim)));
+        reshape_layer->setName((layer_info(node) + "_IShuffleLayer_for_value").c_str());
+        val_t = reshape_layer->getOutput(0);
+
+        // no dedicated slice is needed to align the dims: addSelect only requires the
+        // ranks to match, and it broadcasts internally once they do.
+        /*
+        // slice again: since we expand from rank 0 to the other dims,
+        // start_dim is set to all zeros here and stride_dim to all zeros as well;
+        // the sizes info first uses start_dim as a dummy input, and the real output_dim
+        // info is set afterwards through the setInput interface.
+        std::vector<int64_t> start_vec_new(new_value_rank, 0);
+        auto offset = sizes_to_nvdim(c10::IntArrayRef(start_vec_new));
+
+        // Slice layer does the expansion in TRT. Desired output size is specified by new_value_shape
+        auto slice_layer = engine->network()->addSlice(*val_t, offset, offset, offset);
+        slice_layer->setInput(2, *new_value_shape);
+        slice_layer->setName((layer_info(node) + "_ISliceLayer_for_value").c_str());
+        val_t = slice_layer->getOutput(0);
+        */
+    }
+
+    //no need anymore
+    // POROS_CHECK(broadcastable(self->getDimensions(), mask->getDimensions(), /*multidirectional=*/false),
+    //     "Self and mask tensors are not broadcastable");
+
+    nvinfer1::ISelectLayer* new_layer = engine->network()->addSelect(*mask, *val_t, *self);
+    POROS_CHECK(new_layer, "Unable to create layer for aten::masked_fill");
+
+    new_layer->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << new_layer->getOutput(0)->getDimensions();
+    return true;
+}
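Editorial note: ISelectLayer implements exactly the masked_fill rule once mask, value and self share a rank: the output takes value where mask is true and self elsewhere. Per element, that is the following (standalone sketch, illustrative values):

    #include <iostream>
    #include <vector>

    // out[i] = mask[i] ? value : self[i] -- the per-element rule the select layer applies
    // (with broadcasting) in MaskedFillConverter.
    int main() {
        const std::vector<float> self = {1.f, 2.f, 3.f, 4.f};
        const std::vector<bool> mask = {false, true, false, true};
        const float value = -1.f;
        for (size_t i = 0; i < self.size(); i++) {
            std::cout << (mask[i] ? value : self[i]) << ' '; // 1 -1 3 -1
        }
        std::cout << '\n';
        return 0;
    }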
+
+// aten::gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+bool GatherConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for GatherConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for GatherConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for GatherConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[2]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[2] for GatherConverter is not Tensor as expected");
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    auto maxDim = static_cast<int64_t>(self->getDimensions().nbDims);
+    // extract index
+    nvinfer1::ITensor* index = engine->context().get_tensor(inputs[2]);
+    POROS_CHECK_TRUE(((self != nullptr) && (index != nullptr)),
+        "Unable to init input tensor for node: " << *node);
+    //extract dim
+    int64_t dim = engine->context().get_constant(inputs[1]).toInt();
+    // make sure dim >= 0
+    dim = dim < 0 ? dim + maxDim : dim;
+
+    // Set datatype for indices tensor to INT32
+    nvinfer1::IIdentityLayer* identity = engine->network()->addIdentity(*index);
+    identity->setOutputType(0, nvinfer1::DataType::kINT32);
+    identity->setName((layer_info(node) + "_identity").c_str());
+    index = identity->getOutput(0);
+
+    // IGatherLayer takes in input tensor, the indices, and the axis of input tensor to take indices from
+    nvinfer1::IGatherLayer* gather_layer = engine->network()->addGather(*self, *index, dim);
+    POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node);
+    gather_layer->setName(layer_info(node).c_str());
+    gather_layer->setMode(nvinfer1::GatherMode::kELEMENT);
+    nvinfer1::ITensor* gather_out = gather_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], gather_out);
+    LOG(INFO) << "Output tensor shape: " << gather_out->getDimensions();
+    return true;
+}
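Editorial note: GatherMode::kELEMENT matches the aten::gather contract: index has the same rank as self, and along dim the index value replaces the coordinate. For a 2-D tensor and dim == 1, the rule reduces to out[i][j] = self[i][index[i][j]] (standalone sketch, illustrative values):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // out[i][j] = self[i][index[i][j]] -- aten::gather / GatherMode::kELEMENT with dim == 1.
    int main() {
        const std::vector<std::vector<float>> self = {{1.f, 2.f}, {3.f, 4.f}};
        const std::vector<std::vector<int32_t>> index = {{1, 0}, {0, 0}};
        for (size_t i = 0; i < self.size(); i++) {
            for (size_t j = 0; j < index[i].size(); j++) {
                std::cout << self[i][index[i][j]] << ' '; // 2 1 3 3
            }
        }
        std::cout << '\n';
        return 0;
    }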
" + "But due to the functional limitations of trt gatherlayer, in this version of poros, " + "aten::Index only support one indice input, which means only 0 dim of self can be indexed."); + + /* + // 设self.rank = r,indices_tensors.size() = q (根据规则1 q <= r),则该组输入会选取self的前q维。 + // 现由于trt gatherlayer功能限制,现只能实现self前q-1维的每一维只能选取一个值。 + // 例如:上面例子 output = [x[0][1][2], x[2][3][4]] 是不能支持的(因为选取的第1维同时有0和2,第2维同时有1和3), + // 而output = [x[0][1][2], x[0][1][4]]是能够支持的。(因为选取的第1维只有0,第2维只有1) + // 换句话说,第1至q-1的indices_tensors中的每个值都必须相等。 + // 为便于判断,先设定只有前q-1的indices_tensors所有维度都是1才能支持(因为这样broadcast过去能保证indices_tensor中的每个值都相等) + for (size_t i = 0; i < indices_tensors.size() - 1; i++) { + std::vector input_index_shape_vec = nvdim_to_sizes(indices_tensors[i]->getDimensions()); + size_t shape_prod = 1; + for (size_t j = 0; j < input_index_shape_vec.size(); j++) { + shape_prod *= input_index_shape_vec[j]; + } + if (shape_prod > 1) { + LOG(WARNING) << "Torchscript could have implemented aten::Index with several indices. But due to the functional limitations of trt gatherlayer, " + "in this version of poros, aten::Index only support that every dimension of indices is equal to 1 except the last one."; + return false; + } + } + // 前q - 1维选取 + for (size_t i = 0; i < indices_tensors.size() - 1; i++) { + // Set datatype for indices tensor to INT32 + nvinfer1::IIdentityLayer* identity_layer = engine->network()->addIdentity(*indices_tensors[i]); + identity_layer->setOutputType(0, nvinfer1::DataType::kINT32); + identity_layer->setName((layer_info(node) + "_identify" + std::to_string(i)).c_str()); + indices_tensors[i] = identity_layer->getOutput(0); + + // 由于前q-1的indices_tensors所有维度都是1,可以将indices reshape到1维 + nvinfer1::IShuffleLayer* shuffle_layer = engine->network()->addShuffle(*indices_tensors[i]); + POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node); + shuffle_layer->setName((layer_info(node) + "_shuffle" + std::to_string(i)).c_str()); + std::vector one_vec = {1}; + shuffle_layer->setReshapeDimensions(sizes_to_nvdim(one_vec)); + indices_tensors[i] = shuffle_layer->getOutput(0); + + // 用1维的indices 去gather self的第0维 + nvinfer1::IGatherLayer* gather_layer = engine->network()->addGather(*self, *indices_tensors[i], 0); + POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node); + gather_layer->setName((layer_info(node) + "_gather" + std::to_string(i)).c_str()); + self = gather_layer->getOutput(0); + + // 由于gather出的结果第0维是1,可以将gather出的第0维抹掉 + auto self_shape_vec = nvdim_to_sizes(self->getDimensions()); + self_shape_vec.erase(self_shape_vec.begin()); + nvinfer1::IShuffleLayer* shuffle_layer2 = engine->network()->addShuffle(*self); + POROS_CHECK(shuffle_layer2, "Unable to create shuffle layer from node: " << *node); + shuffle_layer->setName((layer_info(node) + "_shuffle2_" + std::to_string(i)).c_str()); + shuffle_layer2->setReshapeDimensions(sizes_to_nvdim(self_shape_vec)); + self = shuffle_layer2->getOutput(0); + }*/ + + // 最后一维选取,支持indices中包含多个不同值 + nvinfer1::ITensor* final_index = *(--indices_tensors.end()); + // Set datatype for indices tensor to INT32 + nvinfer1::IIdentityLayer* identity_layer = engine->network()->addIdentity(*final_index); + identity_layer->setOutputType(0, nvinfer1::DataType::kINT32); + identity_layer->setName((layer_info(node) + "_identify").c_str()); + final_index = identity_layer->getOutput(0); + + // IGatherLayer takes in input tensor, the indices, and the axis of input tensor to take indices from + nvinfer1::IGatherLayer* gather_layer = 
+    POROS_CHECK(gather_layer, "Unable to create gather layer from node: " << *node);
+    gather_layer->setName((layer_info(node) + "_gather").c_str());
+    nvinfer1::ITensor* gather_out = gather_layer->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], gather_out);
+    LOG(INFO) << "Output tensor shape: " << gather_out->getDimensions();
+    return true;
+}
+
+//aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+//TODO: the accumulate == True situation is not supported yet.
+//TODO: Bool element types for indices are not supported yet.
+bool IndexPutConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for IndexPutConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for IndexPutConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->type()->str().find("Tensor?[]") != std::string::npos),
+        "input[1] for IndexPutConverter is not List[Optional[Tensor]] (Tensor?[]) as expected");
+
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    // extract indices
+    std::vector<nvinfer1::ITensor*> indices_tensors;
+    engine->context().get_tensorlist(inputs[1], indices_tensors);
+    //extract values
+    nvinfer1::ITensor* values = engine->context().get_tensor(inputs[2]);
+    //extract accumulate
+    bool accumulate = (engine->context().get_constant(inputs[3])).toBool();
+
+    //situation 1/3: ---------- when indices_tensors.size() == 0 -------------
+    if (indices_tensors.size() == 0) {
+        engine->context().set_tensor(node->outputs()[0], values);
+        LOG(WARNING) << "met the situation where indices_tensors (the second input value) for index_put is empty.";
+        LOG(INFO) << "Output tensor shape: " << values->getDimensions();
+        return true;
+    }
+
+    if (accumulate == true) {
+        LOG(WARNING) << "the accumulate == true situation is not supported yet";
+        return false;
+    }
+
+    LOG(INFO) << "handle node info: " << node_info(node)
+            << ", self tensor shape: " << self->getDimensions()
+            << ", value tensor shape: " << values->getDimensions()
+            << ", indices_tensors.size(): " << indices_tensors.size();
+
+    auto is_dynamic_shape = PorosGlobalContext::instance().get_poros_options().is_dynamic;
+    nvinfer1::ITensor* index_tensor = nullptr;
+    nvinfer1::ITensor* broadcast_index_shape = nullptr;
+    //situation 2/3: ---------- when indices_tensors.size() > 1 -------------
+    if (indices_tensors.size() > 1) {
+        //TODO: check the indices type; if the scalar type is bool, a NonZero handler should be added
+
+        nvinfer1::ITensor* broadcast_index = indices_tensors[0];
+        //add up the elements of the tensor_list to get the broadcast index_tensor
+        for (size_t index = 1; index < indices_tensors.size(); index++) {
+            auto add = add_elementwise(engine, nvinfer1::ElementWiseOperation::kSUM,
+                            broadcast_index,
+                            indices_tensors[index],
+                            layer_info(node) + "select_add_" + std::to_string(index));
+            broadcast_index = add->getOutput(0);
+        }
+        //get broadcast_index shape.
+ LOG(INFO) << "broadcast_index dim is : " << broadcast_index->getDimensions(); + broadcast_index_shape = engine->network()->addShape(*broadcast_index)->getOutput(0); //shape tensor + auto target_dims = broadcast_index->getDimensions(); + auto output_rank = target_dims.nbDims; + + std::vector new_indices_tensors; + + nvinfer1::ITensor* new_input_shape_tensor = nullptr; + nvinfer1::ITensor* in = nullptr; //current handle indice tensor + for (size_t index = 0; index < indices_tensors.size(); index++) { + //step 2.0: expand the indices + in = indices_tensors[index]; + auto input_dims = in->getDimensions(); + auto input_rank = in->getDimensions().nbDims; + LOG(INFO) << "try to expand tensor shape: " << in->getDimensions() + << " to new shape: " << broadcast_index->getDimensions() + << ", input rank: " << input_rank << ", output rank: " << output_rank; + //situation1: ---------- when input is dynamic shape ------------- + if (is_dynamic_shape == true) { + size_t max_rank = std::max(input_rank, output_rank); + // Dimensions are right alignment. Eg: an input of [3, 1] and max_rank = 4, the result of concat is [1, 1, 3, 1] + if (max_rank - input_rank > 0) { //need shuffle + torch::Tensor the_one = torch::tensor(std::vector(max_rank - input_rank, 1), torch::kInt32); + auto one_tensor = tensor_to_const(engine, the_one); + auto in_shape_tensor = engine->network()->addShape(*in)->getOutput(0); + nvinfer1::ITensor* const args[2] = {one_tensor, in_shape_tensor}; + new_input_shape_tensor = engine->network()->addConcatenation(args, 2)->getOutput(0); + } else { //max_rank - input_rank == 0 + new_input_shape_tensor = engine->network()->addShape(*in)->getOutput(0); + } + auto shuffle = engine->network()->addShuffle(*in); + shuffle->setInput(1, *new_input_shape_tensor); + //LOG(INFO) << "input shuffle to shape: " << shuffle->getOutput(0)->getDimensions(); + + // Start the slicing from beginning of tensor since this is an expand layer + std::vector start_vec(max_rank, 0); + nvinfer1::Dims starts_dim = sizes_to_nvdim(c10::IntArrayRef(start_vec)); + at::Tensor th_start = torch::tensor(nvdim_to_sizes(starts_dim), torch::kInt32); + auto starts = tensor_to_const(engine, th_start); + + // compute sizes = max(x,y). + auto sizes = engine->network()->addElementWise(*new_input_shape_tensor, + *broadcast_index_shape, + nvinfer1::ElementWiseOperation::kMAX)->getOutput(0); + nvinfer1::Dims sizes_dim{-1, {}}; + sizes_dim.nbDims = max_rank; + + // Compute (x > 1 ? 1 : 0) for x in newDims, assuming positive x, using only TensorRT operations. + // min(1, sub(input_shape, 1)) + torch::Tensor thOne = torch::tensor({1}, torch::kInt32); + auto thone_tensor = tensor_to_const(engine, thOne); + auto x_sub_one = engine->network()->addElementWise(*new_input_shape_tensor, + *thone_tensor, + nvinfer1::ElementWiseOperation::kSUB)->getOutput(0); + auto strides = engine->network()->addElementWise(*thone_tensor, + *x_sub_one, + nvinfer1::ElementWiseOperation::kMIN)->getOutput(0); + nvinfer1::Dims strides_dim{-1, {}}; + strides_dim.nbDims = max_rank; + + // Slice layer does the expansion in TRT. Desired output size is specified by sizes input at index 2. + auto slice = engine->network()->addSlice(*shuffle->getOutput(0), starts_dim, sizes_dim, strides_dim); + slice->setInput(1, *starts); + slice->setInput(2, *sizes); + slice->setInput(3, *strides); + auto new_indice = slice->getOutput(0); + //LOG(INFO) << "new indice tensor shape: " << new_indice->getDimensions(); + + //unsqueeze it. 
+                auto dim = nvdim_to_sizes(new_indice->getDimensions()).size(); //this is ok
+                auto shuffle_layer = engine->network()->addShuffle(*new_indice);
+                nvinfer1::ITensor* input_shape_tensor = (engine->network()->addShape(*new_indice))->getOutput(0);
+                nvinfer1::ITensor* reshape_tensor = unsqueeze_nv_shapetensor(engine, input_shape_tensor, dim);
+                shuffle_layer->setInput(1, *reshape_tensor);
+                //LOG(INFO) << "unsqueeze new indice tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+
+                new_indices_tensors.push_back(shuffle_layer->getOutput(0));
+
+            //situation2: ---------- when input is NOT dynamic shape -------------
+            } else { // is_dynamic_shape == false
+                // Validate the expansion. Eg: an input of [3, 1] can be expanded to [1, 3, 4] but not [3, 4, 1]
+                for (int64_t i = target_dims.nbDims - 1; i >= 0; --i) {
+                    int64_t offset = target_dims.nbDims - 1 - i;
+                    int64_t dim = input_dims.nbDims - 1 - offset;
+                    int64_t targetSize = target_dims.d[i];
+                    // In expand layer passing -1 as the size for a dimension means not changing the size of that dimension.
+                    if (targetSize == -1) {
+                        // in(3, 1), expand(3, -1, 4) -> expand(3, 3, 4)
+                        target_dims.d[i] = input_dims.d[dim];
+                    }
+                }
+
+                auto num_expand_dims = target_dims.nbDims - input_dims.nbDims;
+                if (num_expand_dims > 0) {
+                    nvinfer1::Dims reshape_dims;
+                    reshape_dims.nbDims = target_dims.nbDims;
+                    for (int64_t i = 0; i < num_expand_dims; i++) {
+                        reshape_dims.d[i] = 1;
+                    }
+                    for (int64_t i = 0; i < input_dims.nbDims; i++) {
+                        reshape_dims.d[num_expand_dims + i] = input_dims.d[i];
+                    }
+
+                    // Add a reshape layer to expand dims
+                    auto reshape_layer = engine->network()->addShuffle(*in);
+                    reshape_layer->setReshapeDimensions(reshape_dims);
+                    in = reshape_layer->getOutput(0);
+                    //LOG(INFO) << "Input reshaped to : " << in->getDimensions() << " from " << input_dims;
+                }
+
+                // Start the slicing from beginning of tensor since this is an expand layer
+                std::vector<int64_t> start_vec(target_dims.nbDims, 0);
+                auto start_offset = sizes_to_nvdim(c10::IntArrayRef(start_vec));
+
+                // Set the stride of non singleton dimension to 1
+                std::vector<int64_t> strides_vec(target_dims.nbDims, 0);
+                for (int64_t i = 0; i < target_dims.nbDims; i++) {
+                    strides_vec[i] = (in->getDimensions().d[i] != 1);
+                }
+
+                auto strides = sizes_to_nvdim(c10::IntArrayRef(strides_vec));
+                // Slice layer does the expansion in TRT. Desired output size is specified by target_dims
+                auto slice_layer = engine->network()->addSlice(*in, start_offset, target_dims, strides);
+                auto new_indice = slice_layer->getOutput(0);
+                //LOG(INFO) << "new indice tensor shape: " << new_indice->getDimensions();
+
+                //unsqueeze it.
+                auto dim = nvdim_to_sizes(new_indice->getDimensions()).size(); //this is ok
+                auto shuffle_layer = engine->network()->addShuffle(*new_indice);
+                shuffle_layer->setReshapeDimensions(unsqueeze_dims(new_indice->getDimensions(), dim));
+                //LOG(INFO) << "unsqueeze new indice tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+
+                new_indices_tensors.push_back(shuffle_layer->getOutput(0));
+            }
+        }
+
+        auto dim = new_indices_tensors[0]->getDimensions().nbDims - 1;
+        auto cat_layer = engine->network()->addConcatenation(new_indices_tensors.data(), new_indices_tensors.size());
+        cat_layer->setAxis(static_cast<int32_t>(dim));
+        cat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_indices").c_str());
+        index_tensor = cat_layer->getOutput(0);
+
+    //situation 3/3: ---------- when indices_tensors.size() == 1 -------------
+    } else {
+        auto indices_tensor = indices_tensors[0];
+        broadcast_index_shape = engine->network()->addShape(*indices_tensor)->getOutput(0);
+        auto dim = nvdim_to_sizes(indices_tensor->getDimensions()).size(); //this is ok
+        auto shuffle_layer = engine->network()->addShuffle(*indices_tensor);
+        shuffle_layer->setReshapeDimensions(unsqueeze_dims(indices_tensor->getDimensions(), dim));
+        LOG(INFO) << "unsqueeze indice tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+        index_tensor = shuffle_layer->getOutput(0);
+    }
+
+    /********************************************************************
+     * values handle begin
+     * ******************************************************************/
+    auto value_rank = values->getDimensions().nbDims;
+    //1. get self shape; self_shape is a 1-D tensor
+    nvinfer1::ITensor* self_shape = engine->network()->addShape(*self)->getOutput(0);
+    nvinfer1::Dims self_shape_dim = self_shape->getDimensions();
+
+    //2. sub_data_shape = slice(self_shape, axes=[0], starts=[indices_tensors.size()], ends=[INT64_MAX])
+    int64_t start = indices_tensors.size();
+    int64_t end = static_cast<int64_t>(self_shape_dim.d[0]);
+    int64_t size = ceil(float(end - start) / float(1));
+
+    std::vector<int64_t> start_vec = {start};
+    std::vector<int64_t> size_vec = {size};
+    std::vector<int64_t> stride_vec = {1};
+
+    nvinfer1::ITensor* sub_data_shape = engine->network()->addSlice(*self_shape,
+                                            sizes_to_nvdim(start_vec),
+                                            sizes_to_nvdim(size_vec),
+                                            sizes_to_nvdim(stride_vec))->getOutput(0);
+
+    //3. values_shape = g.op("Concat", broadcast_index_shape, sub_data_shape, axis_i=0)
+    std::vector<nvinfer1::ITensor*> to_concat_tensors = {broadcast_index_shape, sub_data_shape};
+    auto shape_cat_layer = engine->network()->addConcatenation(to_concat_tensors.data(), to_concat_tensors.size());
+    shape_cat_layer->setName((layer_info(node) + "_IConcatenationLayer_for_values").c_str());
+    auto values_shape = shape_cat_layer->getOutput(0);
+
+    //4. we should expand values when it is a singular value
+    //values = g.op("Expand", values, values_shape)
+    if (value_rank == 0) {
+        LOG(INFO) << "given value is rank == 0, expand it now";
+        auto new_value_rank = values_shape->getDimensions().d[0];
+
+        // first pad value's dims out to [1, 1, 1, ...], implemented with a shuffle
+        std::vector<int64_t> new_dim(new_value_rank, 1);
+        auto reshape_layer = engine->network()->addShuffle(*values);
+        reshape_layer->setReshapeDimensions(sizes_to_nvdim(c10::IntArrayRef(new_dim)));
+        reshape_layer->setName((layer_info(node) + "_IShuffleLayer_for_rank0_values").c_str());
+        values = reshape_layer->getOutput(0);
+
+        // then slice: since we expand from rank 0 to the other dims,
+        // start_dim is set to all zeros here and stride_dim to all zeros as well;
+        // the sizes info first uses start_dim as a dummy input, and the real output_dim
+        // info is set afterwards through the setInput interface.
+        std::vector<int64_t> start_vec_new(new_value_rank, 0);
+        auto offset = sizes_to_nvdim(c10::IntArrayRef(start_vec_new));
+
+        // Slice layer does the expansion in TRT. Desired output size is specified by values_shape
+        auto slice_layer = engine->network()->addSlice(*values, offset, offset, offset);
+        slice_layer->setInput(2, *values_shape);
+        slice_layer->setName((layer_info(node) + "_ISliceLayer_for_rank0_values").c_str());
+        values = slice_layer->getOutput(0);
+    }
+
+    auto reshape_layer_final = engine->network()->addShuffle(*values);
+    reshape_layer_final->setInput(1, *values_shape);
+    reshape_layer_final->setName((layer_info(node) + "_IShuffleLayer_for_values").c_str());
+    values = reshape_layer_final->getOutput(0);
+    LOG(INFO) << "new_values tensor shape: " << values->getDimensions();
+    /********************************************************************
+     * values handle ends
+     * ******************************************************************/
+
+    nvinfer1::IScatterLayer* scatter_layer = engine->network()->addScatter(*self, *index_tensor, *values, nvinfer1::ScatterMode::kND);
+    //scatter_layer->setAxis(0); // no need
+    scatter_layer->setName((layer_info(node) + "_scatterND").c_str());
+    nvinfer1::ITensor* output = scatter_layer->getOutput(0);
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
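Editorial note: ScatterMode::kND consumes index entries as coordinate tuples: each row of index_tensor addresses one slice of self, and the matching row of values overwrites it, which is how index_put lands its updates. With 1-D coordinates (the single-indices-tensor case above) the rule is simply self[index[k]] = values[k] (standalone sketch, illustrative values):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // self[index[k]] = values[k] -- ScatterMode::kND with 1-D coordinates, as used by
    // IndexPutConverter when a single indices tensor selects along dim 0.
    int main() {
        std::vector<float> self = {10.f, 20.f, 30.f, 40.f};
        const std::vector<int32_t> index = {3, 0};
        const std::vector<float> values = {-1.f, -2.f};
        for (size_t k = 0; k < index.size(); k++) {
            self[index[k]] = values[k];
        }
        for (float v : self) std::cout << v << ' '; // -2 20 30 -1
        std::cout << '\n';
        return 0;
    }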
+
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)
+// For a 3-D tensor, self is updated as:
+// self[index[i][j][k]][j][k] = value # if dim == 0
+// self[i][index[i][j][k]][k] = value # if dim == 1
+// self[i][j][index[i][j][k]] = value # if dim == 2
+// NB: self and index do not necessarily share the same shape, so we only iterate
+// over the subscripts of index. Entries of self that index does not cover keep
+// their original values.
+bool ScatterConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 4), "invalid inputs size for ScatterConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ScatterConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[2]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[2] for ScatterConverter is not Tensor as expected");
+
+    // extract self
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    // extract dim
+    int64_t dim = engine->context().get_constant(inputs[1]).toInt();
+    auto maxDim = static_cast<int64_t>(self->getDimensions().nbDims);
+    dim = dim < 0 ? dim + maxDim : dim;
+    // extract indices
+    nvinfer1::ITensor* index_tensor = engine->context().get_tensor(inputs[2]);
+    // extract scalar
+    auto ivalue_scalar = engine->context().get_constant(inputs[3]);
+    float scalar = ivalue_scalar.toScalar().to<float>();
+
+    nvinfer1::DataType self_data_type = self->getType();
+
+    // IScatterLayer requires the self input to be of float type
+    if (self_data_type != nvinfer1::DataType::kFLOAT) {
+        auto identity_layer = engine->network()->addIdentity(*self);
+        identity_layer->setOutputType(0, nvinfer1::DataType::kFLOAT);
+        identity_layer->setName((layer_info(node) + "_self_identify_float").c_str());
+        self = identity_layer->getOutput(0);
+    }
+
+    // when the value type differs from self's, align it with self; do the type
+    // conversion manually here to keep the precision consistent.
+    if (ivalue_scalar.isDouble() && self_data_type == nvinfer1::DataType::kINT32) {
+        scalar = (float)(int)scalar;
+    }
+
+    nvinfer1::ITensor* updates_tensor = nullptr;
+    bool is_dynamic = check_nvtensor_is_dynamic(index_tensor);
+
+    // the index and updates inputs of nvinfer1::IScatterLayer must share the same shape
+    if (!is_dynamic) {
+        std::vector<int64_t> index_dims_vec = nvdim_to_sizes(index_tensor->getDimensions());
+        updates_tensor = tensor_to_const(engine, at::full(index_dims_vec, scalar, torch::kFloat32));
+    } else {
+        nvinfer1::ITensor* index_shape_tensor = engine->network()->addShape(*index_tensor)->getOutput(0);
+        auto fill_layer = engine->network()->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+        fill_layer->setInput(0, *index_shape_tensor);
+        at::Tensor alpha_tensor = torch::tensor(scalar, torch::kFloat32);
+        fill_layer->setInput(1, *tensor_to_const(engine, alpha_tensor)); // start value
+        at::Tensor delta_tensor = torch::zeros(index_tensor->getDimensions().nbDims, torch::kFloat32);
+        fill_layer->setInput(2, *tensor_to_const(engine, delta_tensor)); // delta values
+        fill_layer->setName((layer_info(node) + "_fill_index_shape_value").c_str());
+        updates_tensor = fill_layer->getOutput(0);
+    }
+
+    // self tensor data type must be DataType::kFLOAT.
+    // index tensor data type must be DataType::kINT32.
+    // updates tensor data type must be DataType::kFLOAT.
+    nvinfer1::IScatterLayer* scatter_layer = engine->network()->addScatter(*self, *index_tensor, *updates_tensor, nvinfer1::ScatterMode::kELEMENT);
+    scatter_layer->setAxis(dim);
+    scatter_layer->setName((layer_info(node) + "_scatter").c_str());
+
+    nvinfer1::ITensor* output = scatter_layer->getOutput(0);
+    // if the output type is no longer the original one (usually int), cast it back
+    if (output->getType() != self_data_type) {
+        auto identity_layer = engine->network()->addIdentity(*output);
+        identity_layer->setOutputType(0, self_data_type);
+        identity_layer->setName((layer_info(node) + "_output_identify_original_type").c_str());
+        output = identity_layer->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+// prim::ConstantChunk
+bool ChunkConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for ChunkConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ChunkConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    // In IR, the prim::ConstantChunk always appears in the form of "prim::ConstantChunk[chunks=xx, dim=xx]()",
+    // so the way to extract its parameters is a little different.
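+    // Worked example (illustrative, not from the original patch): with
+    // in_dim_size == 10 and chunks == 3, every_chunk_size = ceil(10 / 3) = 4,
+    // remainder_size = 10 % 4 = 2 and chunk_num = ceil(10 / 4) = 3, so the
+    // outputs along raw_dim get sizes [4, 4, 2].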
+    int32_t raw_dim = (int32_t)node->i(torch::jit::attr::dim);
+    int32_t chunks = (int32_t)node->i(torch::jit::attr::chunks);
+
+    int32_t in_rank = in->getDimensions().nbDims;
+    // normalize dim when dim < 0
+    raw_dim = raw_dim < 0 ? in_rank + raw_dim : raw_dim;
+    int32_t in_dim_size = in->getDimensions().d[raw_dim];
+
+    int32_t every_chunk_size = (int32_t)ceil((double)in_dim_size / (double)chunks);
+    int32_t remainder_size = in_dim_size % every_chunk_size;
+    int32_t chunk_num = (int32_t)ceil((double)in_dim_size / (double)every_chunk_size);
+
+    // Check whether the calculated chunk_num is equal to the output_num of the node.
+    POROS_CHECK_TRUE((chunk_num == (int32_t)(node->outputs().size())), "The calculated chunk_num (" + std::to_string(chunk_num) +
+                ") is not equal to the node outputs size (" + std::to_string(node->outputs().size()) + ").");
+
+    std::vector<int32_t> chunk_sizes_vec;
+    for (int i = 0; i < chunk_num - 1; i++) {
+        chunk_sizes_vec.push_back(every_chunk_size);
+    }
+    if (remainder_size != 0) {
+        chunk_sizes_vec.push_back(remainder_size);
+    } else {
+        chunk_sizes_vec.push_back(every_chunk_size);
+    }
+
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    tensorlist.reserve(chunk_sizes_vec.size());
+
+    // one gather layer per chunk, selecting a contiguous index range along raw_dim
+    int start_idx = 0;
+    for (size_t i = 0; i < chunk_sizes_vec.size(); i++) {
+        at::Tensor indices = torch::arange(start_idx, start_idx + chunk_sizes_vec[i], 1).to(torch::kI32);
+        auto indices_tensor = tensor_to_const(engine, indices);
+
+        auto gather_layer = engine->network()->addGather(*in, *indices_tensor, raw_dim);
+        auto gather_out = gather_layer->getOutput(0);
+
+        tensorlist.emplace_back(gather_out);
+        start_idx = start_idx + chunk_sizes_vec[i];
+    }
+    for (size_t i = 0; i < chunk_sizes_vec.size(); i++) {
+        engine->context().set_tensor(node->outputs()[i], tensorlist[i]);
+        LOG(INFO) << "Output tensor shape: " << tensorlist[i]->getDimensions();
+    }
+
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, SelectConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, SliceConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, EmbeddingConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, NarrowConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, SplitConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, MaskedFillConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, GatherConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, IndexConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, IndexPutConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ScatterConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, ChunkConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/select.h b/poros/src/poros/converter/gpu/select.h
new file mode 100644
index 0000000000..c7a2c5ea3b
--- /dev/null
+++ b/poros/src/poros/converter/gpu/select.h
@@ -0,0 +1,248 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +/** +* @file select.h +* @author tianjinjin@baidu.com +* @date Tue Aug 24 16:31:28 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class SelectConverter : public GpuConverter { +public: + SelectConverter() {} + virtual ~SelectConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a)"}; + } + + /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELLOW: + * aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) + **/ + const std::vector node_kind() { + return {torch::jit::aten::select}; + } +}; + +class SliceConverter : public GpuConverter { +public: + SliceConverter() {} + virtual ~SliceConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)", + "aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])" + }; + } + + const std::vector node_kind() { + return {torch::jit::aten::slice}; + } + + bool assign_schema_attr() { + bool result = true; + result &= assign_schema_attr_helper({{"aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)", {1, 1}}}); + result &= assign_schema_attr_helper({{"aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])", {1, 1}}}); + return result; + } +}; + +class EmbeddingConverter : public GpuConverter { +public: + EmbeddingConverter() {} + virtual ~EmbeddingConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::embedding}; + } +}; + +class NarrowConverter : public GpuConverter { +public: + NarrowConverter() {} + virtual ~NarrowConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)", + "aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::narrow}; + } +}; + +class SplitConverter : public GpuConverter { +public: + SplitConverter() {} + virtual ~SplitConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]", + "aten::split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]", + "aten::unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::split, + torch::jit::aten::split_with_sizes, + torch::jit::aten::unbind}; + } + + bool assign_schema_attr() { + bool result = true; + result &= assign_schema_attr_helper({{"aten::split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]", {0, 0}}}); + result &= 
assign_schema_attr_helper({{"aten::split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]", {0, 0}}}); + result &= assign_schema_attr_helper({{"aten::unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]", {0, 0}}}); + return result; + } + +}; + +class MaskedFillConverter : public GpuConverter { +public: + MaskedFillConverter() {} + virtual ~MaskedFillConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", + "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::masked_fill}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", {1, 0}}}); + } +}; + +class GatherConverter : public GpuConverter { +public: + GatherConverter() {} + virtual ~GatherConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::gather}; + } +}; + +class IndexConverter : public GpuConverter { +public: + IndexConverter() {} + virtual ~IndexConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::index}; + } +}; + +class IndexPutConverter : public GpuConverter { +public: + IndexPutConverter() {} + virtual ~IndexPutConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::index_put}; + } +}; + +class ScatterConverter : public GpuConverter { +public: + ScatterConverter() {} + virtual ~ScatterConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::scatter}; + } +}; + +class ChunkConverter : public GpuConverter { +public: + ChunkConverter() {} + virtual ~ChunkConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"prim::ConstantChunk(...) -> (...)"}; + } + + const std::vector node_kind() { + return {torch::jit::prim::ConstantChunk}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"prim::ConstantChunk(...) -> (...)", {0, 0}}}); + } +}; +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/shape_handle.cpp b/poros/src/poros/converter/gpu/shape_handle.cpp new file mode 100644 index 0000000000..c8c35d93c3 --- /dev/null +++ b/poros/src/poros/converter/gpu/shape_handle.cpp @@ -0,0 +1,158 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file shape_handle.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/shape_handle.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::size(Tensor self) -> (int[])
+aten::size.int(Tensor self, int dim) -> int
+"*/
+bool AtenSizeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1 || inputs.size() == 2), "invalid inputs size for AtenSizeConverter");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto shape = engine->network()->addShape(*self);
+    POROS_CHECK(shape, "Unable to create shape layer from node: " << *node);
+    shape->setName((layer_info(node) + "_IShapeLayer_for_self").c_str());
+    auto shape_out = shape->getOutput(0);
+
+    //output is int[] situation
+    if (inputs.size() == 1) {
+        LOG(INFO) << "start converter aten::size(Tensor self) -> (int[])";
+        engine->context().set_tensor(node->outputs()[0], shape_out);
+        LOG(INFO) << "Output tensor shape: " << shape_out->getDimensions();
+    //output is int situation
+    } else {
+        LOG(INFO) << "start converter aten::size.int(Tensor self, int dim) -> int";
+        auto dim = (engine->context().get_constant(inputs[1])).toInt();
+        nvinfer1::Dims self_dims = self->getDimensions();
+        dim = dim < 0 ? dim + self_dims.nbDims : dim;
+
+        //extract the specific dynamic dim as a 1D one-value tensor
+        std::vector<int64_t> start_vec{dim}, size_vec{1}, stride_vec{1};
+        auto size_layer = engine->network()->addSlice(*shape_out,
+                                    sizes_to_nvdim(start_vec),
+                                    sizes_to_nvdim(size_vec),
+                                    sizes_to_nvdim(stride_vec));
+        POROS_CHECK(size_layer, "Unable to get dim info from node: " << *node);
+        auto size_out = size_layer->getOutput(0);
+        size_layer->setName((layer_info(node) + "_ISliceLayer_for_size").c_str());
+        engine->context().set_tensor(node->outputs()[0], size_out);
+        LOG(INFO) << "Output tensor shape: " << size_out->getDimensions();
+    }
+    return true;
+}
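+
+// Illustrative note (not from the original patch): for a dynamic input of shape
+// [-1, 3, 224, 224], aten::size.int(self, 0) cannot be folded into a constant, so
+// the converter above slices element 0 out of the IShapeLayer output and hands
+// downstream converters a one-element shape tensor instead.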
+
+bool ShapeastensorConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for ShapeastensorConverter");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto shape = engine->network()->addShape(*self);
+    POROS_CHECK(shape, "Unable to create shape layer from node: " << *node);
+    shape->setName((layer_info(node) + "_IShapeLayer_for_self").c_str());
+    auto shape_out = shape->getOutput(0);
+
+    engine->context().set_tensor(node->outputs()[0], shape_out);
+    LOG(INFO) << "Output tensor shape: " << shape_out->getDimensions();
+
+    return true;
+}
+
+// aten::len.Tensor(Tensor t) -> (int)
+// aten::len.t(t[] a) -> (int)
+bool LenConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for LenConverter");
+
+    // extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    // POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    if (self != nullptr) {
+        nvinfer1::Dims self_dims = self->getDimensions();
+        if (self_dims.nbDims == 0) {
+            engine->context().set_constant(node->outputs()[0], 0);
+        } else if (self_dims.nbDims > 0 && self_dims.d[0] >= 0) {
+            engine->context().set_constant(node->outputs()[0], self_dims.d[0]);
+        } else {
+            // dynamic: read dim 0 from the runtime shape tensor
+            nvinfer1::ITensor* self_shape = engine->network()->addShape(*self)->getOutput(0);
+            self_shape->setName((layer_info(node) + "_IShapeLayer_for_self").c_str());
+
+            std::vector<int64_t> start_vec{0}, size_vec{1}, stride_vec{1};
+            auto slice_layer = engine->network()->addSlice(*self_shape,
+                                        sizes_to_nvdim(start_vec),
+                                        sizes_to_nvdim(size_vec),
+                                        sizes_to_nvdim(stride_vec));
+            POROS_CHECK(slice_layer, "Unable to get dim info from node: " << *node);
+            slice_layer->setName((layer_info(node) + "_ISliceLayer_for_len").c_str());
+            auto len_tensor = slice_layer->getOutput(0);
+            engine->context().set_tensor(node->outputs()[0], len_tensor);
+            LOG(INFO) << "Output tensor shape: " << len_tensor->getDimensions();
+        }
+    } else {
+        // tensorlist
+        if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors())) {
+            std::vector<nvinfer1::ITensor*> output_vec;
+            if (engine->context().get_tensorlist(inputs[0], output_vec)) {
+                engine->context().set_constant(node->outputs()[0], int(output_vec.size()));
+            } else {
+                auto in_const = engine->context().get_constant(inputs[0]);
+                engine->context().set_constant(node->outputs()[0], int(in_const.toList().size()));
+            }
+        // scalarlist
+        } else if (inputs[0]->type()->isSubtypeOf(c10::ListType::ofInts()) ||
+                inputs[0]->type()->isSubtypeOf(c10::ListType::ofFloats()) ||
+                inputs[0]->type()->isSubtypeOf(c10::ListType::ofBools())) {
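+            // int/float/bool lists are compile-time constants here, so their
+            // length can be read straight from the constant IValue: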
auto in_const = engine->context().get_constant(inputs[0]); + engine->context().set_constant(node->outputs()[0], int(in_const.toList().size())); + } else { + POROS_THROW_ERROR("Meet some unsupported output value type in LenConverter" << *node); + } + } + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, AtenSizeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, ShapeastensorConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, LenConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/shape_handle.h b/poros/src/poros/converter/gpu/shape_handle.h new file mode 100644 index 0000000000..9bb23ae91b --- /dev/null +++ b/poros/src/poros/converter/gpu/shape_handle.h @@ -0,0 +1,93 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file shape_handle.h +* @author tianjinjin@baidu.com +* @date Mon Nov 29 20:26:44 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class AtenSizeConverter : public GpuConverter { +public: + AtenSizeConverter() {} + virtual ~AtenSizeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::size(Tensor self) -> (int[])", + "aten::size.int(Tensor self, int dim) -> int"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::size}; + } +}; + +class ShapeastensorConverter : public GpuConverter { +public: + ShapeastensorConverter() {} + virtual ~ShapeastensorConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::_shape_as_tensor(Tensor self) -> (Tensor)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::_shape_as_tensor}; + } +}; + + +class LenConverter : public GpuConverter { +public: + LenConverter() {} + virtual ~LenConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::len.Tensor(Tensor t) -> (int)", + "aten::len.t(t[] a) -> (int)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::len}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::len.t(t[] a) -> (int)", {1, 1}}}); + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/shuffle.cpp b/poros/src/poros/converter/gpu/shuffle.cpp new file mode 100644 index 0000000000..6d351f8ed0 --- /dev/null +++ b/poros/src/poros/converter/gpu/shuffle.cpp @@ -0,0 +1,384 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Part of the following code in this file refs to
+// https://github.com/pytorch/TensorRT/blob/master/core/conversion/converters/impl/shuffle.cpp
+//
+// Copyright (c) 2020-present, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// Licensed under the 3-Clause BSD License
+
+/**
+* @file shuffle.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Aug 18 16:23:29 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/gpu/shuffle.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+ * **/
+bool FlattenConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for FlattenConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for FlattenConverter is not Tensor as expected");
+    //assumes the int inputs all come from prim::Constant.
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for FlattenConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for FlattenConverter does not come from prim::Constant as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+
+    auto start_dim = (engine->context().get_constant(inputs[1])).toInt();
+    auto end_dim = (engine->context().get_constant(inputs[2])).toInt();
+
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+    auto in_shape_rank = in_shape.size();
+    // normalize negative dims to positive ones
+    start_dim = start_dim < 0 ? start_dim + in_shape_rank : start_dim;
+    end_dim = end_dim < 0 ? end_dim + in_shape_rank : end_dim;
+
+    POROS_CHECK_TRUE((start_dim >= 0 && (size_t)start_dim < in_shape_rank &&
+                    end_dim >= 0 && (size_t)end_dim < in_shape_rank &&
+                    start_dim <= end_dim), "invalid start or end dim for node: " << *node);
+
+    std::vector<int64_t> out_shape;
+
+    bool is_dynamic = check_nvtensor_is_dynamic(in);
+    nvinfer1::IShuffleLayer* shuffle_layer = engine->network()->addShuffle(*in);
+    POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+
+    if (is_dynamic) {
+        nvinfer1::ITensor* in_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+        if (start_dim == end_dim) {
+            shuffle_layer->setInput(1, *in_shape_tensor);
+        } else {
+            // Select the dims from start to end with a slice layer and calculate their product.
+            // Then, concat the result with the other dims to get the new shape.
+            std::vector<nvinfer1::ITensor*> cat_nvtensor;
+
+            std::vector<int64_t> stride{1};
+            std::vector<int64_t> front_start{0}, front_size{start_dim};
+            std::vector<int64_t> middle_start{start_dim}, middle_size{end_dim - start_dim + 1};
+            std::vector<int64_t> back_start{end_dim + 1}, back_size{(int64_t)in_shape_rank - end_dim - 1};
+
+            // front
+            if (start_dim > 0) {
+                cat_nvtensor.push_back(engine->network()->addSlice(*in_shape_tensor,
+                                            sizes_to_nvdim(front_start),
+                                            sizes_to_nvdim(front_size),
+                                            sizes_to_nvdim(stride))->getOutput(0));
+            }
+            // middle
+            nvinfer1::ITensor* middle_tensor = engine->network()->addSlice(*in_shape_tensor,
+                                            sizes_to_nvdim(middle_start),
+                                            sizes_to_nvdim(middle_size),
+                                            sizes_to_nvdim(stride))->getOutput(0);
+            uint32_t axis_mask = 1;
+            // axis_mask |= 1 << 1;
+            nvinfer1::IReduceLayer* reduce_prod_layer = engine->network()->addReduce(*middle_tensor,
+                                            nvinfer1::ReduceOperation::kPROD, axis_mask, true);
+            // the default precision is float32; it must be set to int32 for shape math
+            reduce_prod_layer->setPrecision(nvinfer1::DataType::kINT32);
+
+            cat_nvtensor.push_back(reduce_prod_layer->getOutput(0));
+            // back
+            if ((size_t)end_dim < in_shape_rank - 1) {
+                cat_nvtensor.push_back(engine->network()->addSlice(*in_shape_tensor,
+                                            sizes_to_nvdim(back_start),
+                                            sizes_to_nvdim(back_size),
+                                            sizes_to_nvdim(stride))->getOutput(0));
+            }
+            // cat the new shape
+            nvinfer1::IConcatenationLayer* concat_layer =
+                    engine->network()->addConcatenation(cat_nvtensor.data(), cat_nvtensor.size());
+            concat_layer->setAxis(0);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+            shuffle_layer->setInput(1, *(concat_layer->getOutput(0)));
+        }
+    } else {
+        // static situation
+        out_shape = torch::flatten(torch::rand(in_shape), start_dim, end_dim).sizes().vec();
+        shuffle_layer->setReshapeDimensions(sizes_to_nvdim(out_shape));
+    }
+
+    shuffle_layer->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], shuffle_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+    return true;
+}
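+
+// Worked example (illustrative, not from the original patch): flattening a dynamic
+// input of shape [N, 3, 4, 5] with start_dim=1 and end_dim=2 slices the shape
+// tensor into [N], [3, 4] and [5], reduce-multiplies the middle slice into [12],
+// and concatenates back to [N, 12, 5], which then drives the shuffle layer.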
+
+/**
+ * aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+ * aten::view(Tensor(a) self, int[] size) -> Tensor(a)
+ * **/
+bool PermuteViewConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for PermuteViewConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for PermuteViewConverter is not Tensor as expected");
+    //assumes the int inputs all come from prim::Constant.
+    // POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+    //     "input[1] for PermuteViewConverter does not come from prim::Constant as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+
+    std::vector<int64_t> new_order;
+    if (!check_inputs_tensor_scalar(engine, node)) {
+        new_order = (engine->context().get_constant(inputs[1])).toIntList().vec();
+        LOG(INFO) << "Shuffle to: " << sizes_to_nvdim(new_order);
+    }
+
+    auto shuffle = engine->network()->addShuffle(*in);
+    POROS_CHECK(shuffle, "Unable to create shuffle layer from node: " << *node);
+
+    if (node->kind() == torch::jit::aten::permute) {
+        nvinfer1::Permutation permute;
+        std::copy(new_order.begin(), new_order.end(), permute.order);
+        shuffle->setSecondTranspose(permute);
+    } else if (node->kind() == torch::jit::aten::view) {
+        nvinfer1::ITensor* view_size = engine->context().get_tensor(inputs[1]);
+        if (view_size != nullptr) {
+            shuffle->setInput(1, *view_size);
+        } else {
+            shuffle->setReshapeDimensions(sizes_to_nvdim(new_order));
+        }
+    } else {
+        POROS_THROW_ERROR("We should never reach here for PermuteViewConverter, met unsupported node kind!");
+    }
+
+    shuffle->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], shuffle->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle->getOutput(0)->getDimensions();
+    return true;
+}
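+
+// Illustrative example (not from the original patch): aten::permute with dims
+// [0, 2, 3, 1] on an NCHW tensor sets Permutation{0, 2, 3, 1} and yields NHWC,
+// while aten::view with a runtime size tensor takes the setInput(1, ...) branch
+// so the reshape remains shape-tensor driven.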
+
+/**
+ * aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)
+ * **/
+bool ReshapeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for ReshapeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for ReshapeConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+
+    nvinfer1::IShuffleLayer* shuffle_layer = engine->network()->addShuffle(*in);
+    POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+
+    // check whether input[1] is available as a runtime tensor via get_tensor
+    if (engine->context().get_tensor(inputs[1]) != nullptr) {
+        nvinfer1::ITensor* new_shape = engine->context().get_tensor(inputs[1]);
+        shuffle_layer->setInput(1, *new_shape);
+    } else {
+        std::vector<int64_t> new_order = (engine->context().get_constant(inputs[1])).toIntList().vec();
+        // if the input shape is dynamic, going through torch::reshape would be wrong:
+        // std::vector<int64_t> new_shape = torch::reshape(torch::rand(in_shape), new_order).sizes().vec();
+        LOG(INFO) << "Shuffle to: " << sizes_to_nvdim(new_order);
+        shuffle_layer->setReshapeDimensions(sizes_to_nvdim(new_order));
+    }
+
+    shuffle_layer->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], shuffle_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+    return true;
+}
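+
+// Illustrative note (not from the original patch): when the target shape itself is
+// computed at runtime (e.g. reshape(x, (y.size(0), -1))), input[1] resolves to a
+// shape tensor and feeds the shuffle via setInput(1, ...), keeping the reshape
+// dynamic; only fully constant int[] shapes take the setReshapeDimensions path.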
+
+/**
+ * aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+ * **/
+bool TransposeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for TransposeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for TransposeConverter is not Tensor as expected");
+    //assumes the int inputs all come from prim::Constant.
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for TransposeConverter does not come from prim::Constant as expected");
+    POROS_CHECK_TRUE((inputs[2]->node()->kind() == torch::jit::prim::Constant),
+        "input[2] for TransposeConverter does not come from prim::Constant as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_shape = nvdim_to_sizes(in->getDimensions());
+    auto ndims = in_shape.size();
+
+    //extract dim0 & dim1
+    auto dim0 = (engine->context().get_constant(inputs[1])).toInt();
+    auto dim1 = (engine->context().get_constant(inputs[2])).toInt();
+
+    // build the identity permutation, then swap dim0 and dim1
+    std::vector<int64_t> new_order;
+    for (size_t i = 0; i < ndims; i++) {
+        new_order.push_back(i);
+    }
+    dim0 = dim0 < 0 ? (dim0 + ndims) : dim0;
+    dim1 = dim1 < 0 ? (dim1 + ndims) : dim1;
+    std::swap(new_order[dim0], new_order[dim1]);
+    LOG(INFO) << "Shuffle to: " << sizes_to_nvdim(new_order);
+
+    auto shuffle = engine->network()->addShuffle(*in);
+    POROS_CHECK(shuffle, "Unable to create shuffle layer from node: " << *node);
+    nvinfer1::Permutation permute;
+    std::copy(new_order.begin(), new_order.end(), permute.order);
+    shuffle->setSecondTranspose(permute);
+    shuffle->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], shuffle->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle->getOutput(0)->getDimensions();
+    return true;
+}
+
+/**
+ * aten::t(Tensor(a) self) -> Tensor(a)
+ * **/
+bool AtenTConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for AtenTConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for AtenTConverter is not Tensor as expected");
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto input_dims = in->getDimensions();
+
+    if (input_dims.nbDims < 2) {
+        //For the aten::t situation: if input tensors < 2D, return them as-is
+        engine->context().set_tensor(node->outputs()[0], in);
+        LOG(INFO) << "Output tensor shape: " << in->getDimensions();
+        return true;
+    }
+
+    auto shuffle = engine->network()->addShuffle(*in);
+    POROS_CHECK(shuffle, "Unable to create shuffle layer from node: " << *node);
+    nvinfer1::Permutation first_perm;
+    first_perm.order[0] = 1;
+    first_perm.order[1] = 0;
+    shuffle->setFirstTranspose(first_perm);
+    shuffle->setZeroIsPlaceholder(false);
+    shuffle->setName(layer_info(node).c_str());
+    engine->context().set_tensor(node->outputs()[0], shuffle->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle->getOutput(0)->getDimensions();
+    return true;
+}
+
+/**
+ * aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+ * **/
+bool PixelShuffleConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    //basic check
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for PixelShuffleConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for PixelShuffleConverter is not Tensor as expected");
+    //assumes the int inputs all come from prim::Constant.
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for PixelShuffleConverter does not come from prim::Constant as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto in_shape = nvdim_to_sizes(self->getDimensions());
+    int64_t irank = in_shape.size();
+    POROS_CHECK(irank >= 3, "pixel_shuffle expects input to have at least 3 dimensions, but got input with "
+        << std::to_string(irank) << " dimension(s)");
+
+    //extract upscale_factor
+    int64_t upscale_factor = (engine->context().get_constant(inputs[1])).toInt();
+    POROS_CHECK(upscale_factor > 0, "pixel_shuffle expects a positive upscale_factor, but got "
+        << std::to_string(upscale_factor));
+    int64_t upscale_factor_squared = upscale_factor * upscale_factor;
+
+    const auto NUM_NON_BATCH_DIMS = 3;
+    const auto self_sizes_batch_end = in_shape.end() - NUM_NON_BATCH_DIMS;
+
+    int64_t ic = in_shape[irank - 3];
+    int64_t ih = in_shape[irank - 2];
+    int64_t iw = in_shape[irank - 1];
+    POROS_CHECK(ic % upscale_factor_squared == 0,
+        "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of "
+        << "upscale_factor, but input.size(-3)=" << std::to_string(ic) << " is not divisible by "
+        << std::to_string(upscale_factor_squared));
+
+    int64_t oc = ic / upscale_factor_squared;
+    int64_t oh = ih * upscale_factor;
+    int64_t ow = iw * upscale_factor;
+
+    // first view: split the channel dim into [oc, r, r] while keeping [ih, iw]
+    std::vector<int64_t> added_dims_shape(in_shape.begin(), self_sizes_batch_end);
+    added_dims_shape.insert(added_dims_shape.end(), {oc, upscale_factor, upscale_factor, ih, iw});
+    auto view_layer = engine->network()->addShuffle(*self);
+    POROS_CHECK(view_layer, "Unable to create shuffle layer from node: " << *node);
+    view_layer->setReshapeDimensions(sizes_to_nvdim(added_dims_shape));
+    int64_t view_rank = added_dims_shape.size();
+
+    // then permute the split factors next to the spatial dims
+    auto permutation_layer = engine->network()->addShuffle(*view_layer->getOutput(0));
+    POROS_CHECK(permutation_layer, "Unable to create shuffle layer from node: " << *node);
+    std::vector<int64_t> new_order(in_shape.begin(), self_sizes_batch_end);
+    std::iota(new_order.begin(), new_order.end(), 0);
+    new_order.insert(
+        new_order.end(),
+        {view_rank - 5, view_rank - 2, view_rank - 4, view_rank -
1, view_rank - 3}); + nvinfer1::Permutation permute; + std::copy(new_order.begin(), new_order.end(), permute.order); + permutation_layer->setSecondTranspose(permute); + + + std::vector final_shape(in_shape.begin(), self_sizes_batch_end); + final_shape.insert(final_shape.end(), {oc, oh, ow}); + auto last_view_layer = engine->network()->addShuffle(*permutation_layer->getOutput(0)); + POROS_CHECK(last_view_layer, "Unable to create shuffle layer from node: " << *node); + last_view_layer->setReshapeDimensions(sizes_to_nvdim(final_shape)); + last_view_layer->setName(layer_info(node).c_str()); + engine->context().set_tensor(node->outputs()[0], last_view_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << last_view_layer->getOutput(0)->getDimensions(); + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, FlattenConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, PermuteViewConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, ReshapeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, TransposeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, AtenTConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, PixelShuffleConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/shuffle.h b/poros/src/poros/converter/gpu/shuffle.h new file mode 100644 index 0000000000..81bac43c0c --- /dev/null +++ b/poros/src/poros/converter/gpu/shuffle.h @@ -0,0 +1,163 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file shuffle.h +* @author tianjinjin@baidu.com +* @date Wed Aug 18 15:37:48 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class FlattenConverter : public GpuConverter { +public: + FlattenConverter() {} + virtual ~FlattenConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)"}; + } + + /** TODO: TO SUPPORT CONVERTERS BELLOW: + * aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) + * aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) + * aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) + * **/ + + const std::vector node_kind() { + return {torch::jit::aten::flatten}; + } +}; + + +class PermuteViewConverter : public GpuConverter { +public: + PermuteViewConverter() {} + virtual ~PermuteViewConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)", + "aten::view(Tensor(a) self, int[] size) -> Tensor(a)"}; + } + + /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELLOW: + * aten::view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a) + **/ + const std::vector node_kind() { + return {torch::jit::aten::permute, + torch::jit::aten::view}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::view(Tensor(a) self, int[] size) -> Tensor(a)", {1, 1}}}); + } +}; + +class ReshapeConverter : public GpuConverter { +public: + ReshapeConverter() {} + virtual ~ReshapeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::reshape}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)", {1, 1}}}); + } +}; + +class TransposeConverter : public GpuConverter { +public: + TransposeConverter() {} + virtual ~TransposeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)"}; + } + + /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELLOW: + * aten::transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) + * aten::transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) 
+ **/ + const std::vector node_kind() { + return {torch::jit::aten::transpose}; + } +}; + +class AtenTConverter : public GpuConverter { +public: + AtenTConverter() {} + virtual ~AtenTConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::t(Tensor(a) self) -> Tensor(a)"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::t}; + } +}; + +class PixelShuffleConverter : public GpuConverter { +public: + PixelShuffleConverter() {} + virtual ~PixelShuffleConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor"}; + } + + const std::vector node_kind() { + return {torch::jit::aten::pixel_shuffle}; + } + + bool assign_schema_attr() { + return assign_schema_attr_helper({{"aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor", {0, 0}}}); + } +}; + + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/converter/gpu/softmax.cpp b/poros/src/poros/converter/gpu/softmax.cpp new file mode 100644 index 0000000000..0b1857e184 --- /dev/null +++ b/poros/src/poros/converter/gpu/softmax.cpp @@ -0,0 +1,118 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file softmax.cpp +* @author tianjinjin@baidu.com +* @date Tue Aug 24 17:15:33 CST 2021 +* @brief +**/ + +#include "poros/converter/gpu/softmax.h" +#include "poros/converter/gpu/weight.h" +#include "poros/converter/gpu/converter_util.h" +#include "poros/engine/tensorrt_engine.h" +#include "poros/engine/trtengine_util.h" +#include "poros/context/poros_global.h" +#include "poros/util/macros.h" +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +/*aten::softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor*/
+bool SoftmaxConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 3), "invalid inputs size for SoftmaxConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for SoftmaxConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for SoftmaxConverter does not come from prim::Constant as expected");
+    LOG(INFO) << "Disregarding input[2] dtype argument";
+
+    auto in = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((in != nullptr), "Unable to init input tensor for node: " << *node);
+    auto shape = nvdim_to_sizes(in->getDimensions());
+
+    bool is_dynamic = check_nvtensor_is_dynamic(in);
+    nvinfer1::ITensor* in_shape_tensor = nullptr;
+    if (is_dynamic) {
+        in_shape_tensor = engine->network()->addShape(*in)->getOutput(0);
+    }
+    // SoftMax needs at least a 2D input, so pad a leading 1 if necessary
+    if (shape.size() < 2) {
+        auto new_shape = sizes_to_nvdim_with_pad(shape, 2);
+        auto shuffle = engine->network()->addShuffle(*in);
+        shuffle->setReshapeDimensions(new_shape);
+        shuffle->setName((layer_info(node) + " [Reshape to " + nvdim_to_str(new_shape) + ']').c_str());
+        if (is_dynamic) {
+            nvinfer1::ITensor* insert_tensor = tensor_to_const(engine, torch::tensor({1}, torch::kInt32));
+            std::vector<nvinfer1::ITensor*> inputs_nvtensor;
+            inputs_nvtensor.push_back(insert_tensor);
+            inputs_nvtensor.push_back(in_shape_tensor);
+            nvinfer1::IConcatenationLayer* concat_layer =
+                engine->network()->addConcatenation(inputs_nvtensor.data(), inputs_nvtensor.size());
+            concat_layer->setAxis(0);
+            concat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+            nvinfer1::ITensor* concat_out = concat_layer->getOutput(0);
+            shuffle->setInput(1, *concat_out);
+            shuffle->setName((layer_info(node) + "_IShuffleLayer_1D_to_2D").c_str());
+        }
+        in = shuffle->getOutput(0);
+    }
+
+    //extract dim
+    auto dim = (engine->context().get_constant(inputs[1])).toInt();
+    if (dim < 0) {
+        dim = shape.size() + dim;
+    }
+
+    //main function
+    auto softmax = engine->network()->addSoftMax(*in);
+    POROS_CHECK(softmax, "Unable to create softmax layer from node: " << *node);
+    if (shape.size() > 1) {
+        softmax->setAxes(1 << (dim));
+    } else {
+        // When there is no batch dimension
+        softmax->setAxes(1 << (dim + 1));
+    }
+    softmax->setName((layer_info(node) + "_ISoftMaxLayer").c_str());
+    auto out_tensor = softmax->getOutput(0);
+
+    // SoftMax reshape back
+    if (shape.size() < 2) {
+        auto old_shape = sizes_to_nvdim(shape);
+        LOG(INFO) << "Input shape was less than 2D got: " << old_shape
+                << ", inserting shuffle layer to reshape back";
+        auto shuffle = engine->network()->addShuffle(*out_tensor);
+        shuffle->setReshapeDimensions(old_shape);
+        shuffle->setName((layer_info(node) + " [Reshape to " + nvdim_to_str(old_shape) + ']').c_str());
+        if (is_dynamic) {
+            shuffle->setInput(1, *in_shape_tensor);
+            shuffle->setName((layer_info(node) + "shuffle_to_old_shape").c_str());
+        }
+        out_tensor = shuffle->getOutput(0);
+    }
+
+    engine->context().set_tensor(node->outputs()[0], out_tensor);
+    LOG(INFO) << "Output tensor shape: " << out_tensor->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, SoftmaxConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/softmax.h b/poros/src/poros/converter/gpu/softmax.h
new file mode 100644
index 0000000000..9e0fdaf5ab
--- /dev/null +++ b/poros/src/poros/converter/gpu/softmax.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file softmax.h +* @author tianjinjin@baidu.com +* @date Tue Aug 24 17:15:33 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class SoftmaxConverter : public GpuConverter { +public: + SoftmaxConverter() {} + virtual ~SoftmaxConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"}; + } + + /** TODO: TRY TO SUPPORT SCHEMA PATTERNS BELLOW: + * aten::softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor + **/ + const std::vector node_kind() { + return {torch::jit::aten::softmax}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/squeeze.cpp b/poros/src/poros/converter/gpu/squeeze.cpp new file mode 100644 index 0000000000..77aed22854 --- /dev/null +++ b/poros/src/poros/converter/gpu/squeeze.cpp @@ -0,0 +1,206 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+/**
+* @file squeeze.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Sep 1 11:19:13 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/squeeze.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+nvinfer1::IShuffleLayer* add_shuffle_layer(TensorrtEngine* engine, const torch::jit::Node *node, \
+                    nvinfer1::ITensor* input, int64_t dim, int64_t idx) {
+    auto shuffle_layer = engine->network()->addShuffle(*input);
+    POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+    shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_index_" + std::to_string(idx)).c_str());
+    nvinfer1::ITensor* input_shape_tensor = (engine->network()->addShape(*input))->getOutput(0);
+    nvinfer1::ITensor* reshape_tensor = squeeze_nv_shapetensor(engine, input_shape_tensor, dim);
+
+    if (reshape_tensor != nullptr) {
+        shuffle_layer->setInput(1, *reshape_tensor);
+    } else {
+        LOG(INFO) << "squeeze nv shape tensor error!";
+        return nullptr;
+    }
+    return shuffle_layer;
+}
+
+/*
+"aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)",
+https://pytorch.org/docs/stable/generated/torch.squeeze.html
+Removes the dimensions of size 1 from the input tensor shape and returns the result.
+For an input of shape (A x 1 x B x 1 x C x 1 x D), the output shape is (A x B x C x D).*/
+bool SqueezeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2 || inputs.size() == 1), "invalid inputs size for SqueezeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for SqueezeConverter is not Tensor as expected");
+    if (inputs.size() == 2) {
+        POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+            "input[1] for SqueezeConverter does not come from prim::Constant as expected");
+    }
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    std::vector<int64_t> dims;
+    int64_t sign = 0;
+    if (inputs.size() == 1) {
+        // only non-dynamic inputs are supported here for now
+        auto shape = self->getDimensions().d;
+        for (int i = 0; i < self->getDimensions().nbDims; i++) {
+            if (shape[i] == 1) {
+                // shift each index by the number of size-1 dims already removed
+                dims.push_back(i - sign);
+                sign += 1;
+            }
+        }
+        if (dims.size() == 0) {
+            // nothing to squeeze, pass the input through unchanged
+            engine->context().set_tensor(node->outputs()[0], self);
+            return true;
+        }
+    } else {
+        //extract dim
+        auto dim = (engine->context().get_constant(inputs[1])).toInt();
+        auto self_dim = nvdim_to_sizes(self->getDimensions());
+        if (dim < 0) {
+            dim = self_dim.size() + dim;
+        }
+
+        if (self_dim[dim] != 1) {
+            // no squeeze is needed for this dim
+            engine->context().set_tensor(node->outputs()[0], self);
+            LOG(INFO) << "Output tensor shape: " << self->getDimensions();
+            return true;
+        } else {
+            dims = {dim};
+        }
+    }
+
+    bool is_dynamic = check_nvtensor_is_dynamic(self);
+    nvinfer1::IShuffleLayer* shuffle_layer = nullptr;
+    if (is_dynamic) {
+        // chain one shuffle layer per squeezed dim, driven by runtime shape tensors
+        shuffle_layer = add_shuffle_layer(engine, node, self, dims[0], 0);
+        if (nullptr == shuffle_layer) {
+            LOG(INFO) << "squeeze nv shape tensor error!";
+            return false;
+        }
+        for (size_t i = 1; i < dims.size(); i++) {
+            shuffle_layer = add_shuffle_layer(engine, node, shuffle_layer->getOutput(0), dims[i], i);
+            POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+        }
+    } else {
+        shuffle_layer = engine->network()->addShuffle(*self);
+        POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+        shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_self").c_str());
+        for (size_t i = 0; i < dims.size(); i++) {
+            if (i == 0) {
+                shuffle_layer->setReshapeDimensions(squeeze_dims(self->getDimensions(), dims[i]));
+            } else {
+                shuffle_layer->setReshapeDimensions(squeeze_dims(shuffle_layer->getOutput(0)->getDimensions(), dims[i]));
+            }
+
+            if (i != dims.size() - 1) {
+                shuffle_layer = engine->network()->addShuffle(*shuffle_layer->getOutput(0));
+                shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_output").c_str());
+            }
+        }
+    }
+    engine->context().set_tensor(node->outputs()[0], shuffle_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << shuffle_layer->getOutput(0)->getDimensions();
+    return true;
+}
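+
+// Illustrative example (not from the original patch): squeezing a static input of
+// shape [2, 1, 3, 1] without a dim argument collects dims {1, 2} (each index is
+// shifted by the number of size-1 dims already removed), then chains one shuffle
+// per squeezed dim, ending at shape [2, 3].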
+
+/*
+"aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)",
+https://pytorch.org/docs/stable/generated/torch.unsqueeze.html*/
+bool UnSqueezeConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 2), "invalid inputs size for UnSqueezeConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for UnSqueezeConverter is not Tensor as expected");
+    POROS_CHECK_TRUE((inputs[1]->node()->kind() == torch::jit::prim::Constant),
+        "input[1] for UnSqueezeConverter does not come from prim::Constant as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    //extract dim
+    auto dim = (engine->context().get_constant(inputs[1])).toInt();
+    // special case: unsqueezing a rank-0 (scalar) tensor at dim 0 yields shape [1]
+    if (self->getDimensions().nbDims == 0 && dim == 0) {
+        auto shuffle_layer = engine->network()->addShuffle(*self);
+        nvinfer1::Dims unsqueeze_dim;
+        unsqueeze_dim.nbDims = 1;
+        unsqueeze_dim.d[0] = 1;
+        shuffle_layer->setReshapeDimensions(unsqueeze_dim);
+        shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_self").c_str());
+        auto output = shuffle_layer->getOutput(0);
+        engine->context().set_tensor(node->outputs()[0], output);
+        LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+        return true;
+    }
+    auto self_dim = nvdim_to_sizes(self->getDimensions());
+    int64_t nbDims = self_dim.size();
+    POROS_CHECK((dim <= nbDims && dim >= -(nbDims + 1)),
+        "Dimension out of range (expected to be in range of [" << -(nbDims + 1)
+        << ", " << nbDims << "], but got " << dim << ")");
+    if (dim < 0) {
+        dim = self_dim.size() + dim + 1;
+    }
+
+    auto shuffle_layer = engine->network()->addShuffle(*self);
+    POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+    bool is_dynamic = check_nvtensor_is_dynamic(self);
+    if (is_dynamic) {
+        nvinfer1::ITensor* input_shape_tensor = (engine->network()->addShape(*self))->getOutput(0);
+        nvinfer1::ITensor* reshape_tensor = unsqueeze_nv_shapetensor(engine, input_shape_tensor, dim);
+        if (reshape_tensor != nullptr) {
+            shuffle_layer->setInput(1, *reshape_tensor);
+        } else {
+            LOG(INFO) << "unsqueeze nv shape tensor error!";
+            return false;
+        }
+    } else {
+        shuffle_layer->setReshapeDimensions(unsqueeze_dims(self->getDimensions(), dim));
+    }
+    shuffle_layer->setName((layer_info(node) +
"_IShuffleLayer_for_self").c_str()); + engine->context().set_tensor(node->outputs()[0], shuffle_layer->getOutput(0)); + LOG(INFO) << "Output tensor shape: " << shuffle_layer->getOutput(0)->getDimensions(); + + return true; +} + +POROS_REGISTER_CONVERTER(TensorrtEngine, SqueezeConverter); +POROS_REGISTER_CONVERTER(TensorrtEngine, UnSqueezeConverter); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/squeeze.h b/poros/src/poros/converter/gpu/squeeze.h new file mode 100644 index 0000000000..aca128578a --- /dev/null +++ b/poros/src/poros/converter/gpu/squeeze.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file squeeze.h +* @author tianjinjin@baidu.com +* @date Wed Sep 1 11:19:13 CST 2021 +* @brief +**/ + +#pragma once + +#include + +//from pytorch +#include "torch/script.h" + +#include "poros/converter/gpu/gpu_converter.h" +#include "poros/engine/tensorrt_engine.h" + +namespace baidu { +namespace mirana { +namespace poros { + +class SqueezeConverter : public GpuConverter { +public: + SqueezeConverter() {} + virtual ~SqueezeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)", + "aten::squeeze(Tensor(a) self) -> (Tensor(a))"}; + } + + /** TODO: TO SUPPORT CONVERTERS BELLOW: + * "aten::squeeze(Tensor(a) self) -> Tensor(a)", + * "aten::squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)" + * **/ + const std::vector node_kind() { + return {torch::jit::aten::squeeze}; + } +}; + +class UnSqueezeConverter : public GpuConverter { +public: + UnSqueezeConverter() {} + virtual ~UnSqueezeConverter() {} + + bool converter(TensorrtEngine* engine, const torch::jit::Node *node); + + const std::vector schema_string() { + return {"aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)", + }; + } + + const std::vector node_kind() { + return {torch::jit::aten::unsqueeze}; + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/converter/gpu/stack.cpp b/poros/src/poros/converter/gpu/stack.cpp new file mode 100644 index 0000000000..8cd3a315bd --- /dev/null +++ b/poros/src/poros/converter/gpu/stack.cpp @@ -0,0 +1,110 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/poros/src/poros/converter/gpu/stack.cpp b/poros/src/poros/converter/gpu/stack.cpp
new file mode 100644
index 0000000000..8cd3a315bd
--- /dev/null
+++ b/poros/src/poros/converter/gpu/stack.cpp
@@ -0,0 +1,110 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file stack.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 15:09:14 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/stack.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::stack(Tensor[] tensors, int dim=0) -> Tensor",
+"aten::vstack(Tensor[] tensors) -> Tensor"
+*/
+bool StackConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1 || inputs.size() == 2), "invalid inputs size for StackConverter");
+    POROS_CHECK_TRUE(inputs[0]->type()->isSubtypeOf(c10::ListType::ofTensors()),
+        "input[0] for StackConverter is not TensorList as expected");
+
+    //extract tensors
+    std::vector<nvinfer1::ITensor*> tensorlist;
+    POROS_CHECK_TRUE((engine->context().get_tensorlist(inputs[0], tensorlist)), "extract tensorlist error");
+
+    int64_t dim = 0;
+
+    std::vector<nvinfer1::ITensor*> tensors;
+    if (inputs.size() == 2) {
+        // aten::stack
+        POROS_CHECK_TRUE(inputs[1]->type()->isSubtypeOf(c10::NumberType::get()),
+            "input[1] for StackConverter is not int64_t as expected");
+
+        //extract dims
+        dim = (engine->context().get_constant(inputs[1])).toInt();
+
+        // aten::stack should unsqueeze dims
+        // check if input tensorlist is dynamic.
+        bool is_dynamic = check_nvtensor_is_dynamic(tensorlist[0]);
+        nvinfer1::Dims inputs_dims = tensorlist[0]->getDimensions();
+
+        // when dim is negative
+        if (dim < 0) {
+            dim = inputs_dims.nbDims + dim + 1;
+        }
+        // generate unsqueeze dimensions by shapetensor if dynamic.
+        nvinfer1::ITensor* unsqueeze_dim = nullptr;
+        if (is_dynamic) {
+            nvinfer1::ITensor* input_shapetensor = engine->network()->addShape(*(tensorlist[0]))->getOutput(0);
+            unsqueeze_dim = unsqueeze_nv_shapetensor(engine, input_shapetensor, dim);
+            if (unsqueeze_dim == nullptr) {
+                LOG(INFO) << "unsqueeze nv shape tensor failed";
+                return false;
+            }
+        }
+        // unsqueeze each tensor in tensorlist
+        for (size_t i = 0; i < tensorlist.size(); ++i) {
+            auto shuffle_layer = engine->network()->addShuffle(*tensorlist[i]);
+            POROS_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *node);
+            if (is_dynamic) {
+                shuffle_layer->setInput(1, *unsqueeze_dim);
+            } else {
+                shuffle_layer->setReshapeDimensions(unsqueeze_dims(tensorlist[i]->getDimensions(), dim));
+            }
+            shuffle_layer->setName((layer_info(node) + "_IShuffleLayer_for_tensor_" + std::to_string(i)).c_str());
+            tensors.push_back(shuffle_layer->getOutput(0));
+        }
+    } else {
+        // aten::vstack need not unsqueeze dims
+        tensors = tensorlist;
+    }
+
+    auto cat_layer = engine->network()->addConcatenation(tensors.data(), tensors.size());
+    POROS_CHECK(cat_layer, "Unable to create concatenation layer from node: " << *node);
+    cat_layer->setAxis(static_cast<int>(dim));
+    cat_layer->setName((layer_info(node) + "_IConcatenationLayer").c_str());
+    engine->context().set_tensor(node->outputs()[0], cat_layer->getOutput(0));
+    LOG(INFO) << "Output tensor shape: " << cat_layer->getOutput(0)->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, StackConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
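+
+/* Hedged reference sketch (not wired into the build): aten::stack is exactly
+ * unsqueeze-at-dim followed by concat, which is what the converter above emits
+ * as IShuffleLayer + IConcatenationLayer. Shapes are hypothetical. */
+#if 0
+static void stack_reference() {
+    at::Tensor a = at::randn({2, 3});
+    at::Tensor b = at::randn({2, 3});
+    at::Tensor stacked = at::stack({a, b}, 1);                         // [2, 2, 3]
+    at::Tensor manual  = at::cat({a.unsqueeze(1), b.unsqueeze(1)}, 1); // [2, 2, 3]
+    TORCH_CHECK(stacked.equal(manual));
+}
+#endif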
diff --git a/poros/src/poros/converter/gpu/stack.h b/poros/src/poros/converter/gpu/stack.h
new file mode 100644
index 0000000000..e400e3fd16
--- /dev/null
+++ b/poros/src/poros/converter/gpu/stack.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file stack.h
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 15:09:14 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class StackConverter : public GpuConverter {
+public:
+    StackConverter() {}
+    virtual ~StackConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::stack(Tensor[] tensors, int dim=0) -> Tensor",
+                "aten::vstack(Tensor[] tensors) -> Tensor"
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::stack,
+                torch::jit::aten::vstack,
+                };
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::stack(Tensor[] tensors, int dim=0) -> Tensor", {1, 1}}});
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/to.cpp b/poros/src/poros/converter/gpu/to.cpp
new file mode 100644
index 0000000000..32373f408b
--- /dev/null
+++ b/poros/src/poros/converter/gpu/to.cpp
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file to.cpp
+* @author wangrui39@baidu.com
+* @date Saturday November 13 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/to.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/engine_context.h"
+#include "poros/util/macros.h"
+#include "poros/context/poros_global.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+static void long_to_int(at::ScalarType &scalar_type) {
+    if (scalar_type == at::kLong && PorosGlobalContext::instance().get_poros_options().long_to_int == true) {
+        scalar_type = at::kInt;
+        LOG(WARNING) << "gen_tensor_type meets at::kLong tensor type, change this to at::kInt. "
+                     << "Attention: this may lead to precision change";
+    }
+}
+
+bool ToConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5 || inputs.size() == 6 ||
+        inputs.size() == 8), "invalid inputs size for ToConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "inputs[0] for ToConverter is not Tensor as expected");
+    auto self = engine->context().get_tensor(inputs[0]);
+    nvinfer1::DataType output_type = self->getType();
+
+    // aten::to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
+    if (inputs[1]->type()->isSubtypeOf(c10::TensorType::get())) {
+        auto other = engine->context().get_tensor(inputs[1]);
+        output_type = other->getType();
+
+    // aten::to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
+    // aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> (Tensor(b|a))
+    } else if (inputs[2]->type()->str() == "int" && inputs[3]->type()->str() == "bool") {
+        auto scalar_type = engine->context().get_constant(inputs[2]).toScalarType();
+        long_to_int(scalar_type);
+        output_type = attype_to_nvtype(scalar_type);
+        if (engine->context().get_constant(inputs[1]).toDevice().is_cuda()) {
+            auto device = nvinfer1::TensorLocation::kDEVICE;
+            self->setLocation(device);
+        } else {
+            LOG(WARNING) << "Set tensor device to HOST but only DEVICE is supported";
+            return false;
+        }
+
+    /* aten::to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor
+       aten::to.dtype_layout(Tensor self, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None,
+       bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor)*/
+    } else if (inputs[1]->type()->str() == "int") {
+        auto scalar_type = engine->context().get_constant(inputs[1]).toScalarType();
+        long_to_int(scalar_type);
+        output_type = attype_to_nvtype(scalar_type);
+    // Input err
+    } else {
+        POROS_THROW_ERROR("Meet some unsupported inputs value type in ToConstructConverter" << *node);
+        return false;
+    }
+
+    // Set datatype for self to dtype.
+    // Note: even when output_type equals the input type, the tensor must still go through an
+    // identity layer to be safe; otherwise execute_engine may fail.
+    // Example: if both the input and the output tensor of aten::to are marked as engine outputs
+    // and the identity layer is skipped, the two tensors are in fact one tensor. TensorRT does
+    // not allow marking the same tensor as an output twice (it only overwrites the previous
+    // mark), so build_engine reports "xxx has been marked as output". Two outputs are expected
+    // but only one exists, and execute_engine crashes when fetching the outputs.
+    // todo: same-dtype aten::to.dtype nodes could also be eliminated from the graph.
+    auto identity = engine->network()->addIdentity(*self);
+    identity->setOutputType(0, output_type);
+    identity->setName((layer_info(node) + "_IIdentityLayer_for_self").c_str());
+    self = identity->getOutput(0);
+    // setOutputType may not take effect; call setType as well to make sure self really got cast
+    self->setType(output_type);
+
+    engine->context().set_tensor(node->outputs()[0], self);
+    LOG(INFO) << "Output tensor shape: " << self->getDimensions();
+    return true;
+}
+
+// prim::NumToTensor.Scalar(Scalar a) -> (Tensor)
+bool NumtotensorConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for NumtotensorConverter");
+    // if the scalar is already wrapped in a tensor, pass it straight through
+    nvinfer1::ITensor* self = engine->context().get_tensor(inputs[0]);
+    if (self != nullptr) {
+        engine->context().set_tensor(node->outputs()[0], self);
+        LOG(INFO) << "Output tensor shape: " << self->getDimensions();
+    } else {
+        // a real scalar was passed in
+        auto input_scalar = engine->context().get_constant(inputs[0]);
+        if (!input_scalar.isScalar()) {
+            POROS_THROW_ERROR("prim::NumToTensor input[0] is not scalar!");
+            return false;
+        }
+        nvinfer1::ITensor* output_tensor = nullptr;
+        if (input_scalar.isInt()) {
+            output_tensor = tensor_to_const(engine, at::tensor(input_scalar.toInt(), torch::kInt));
+        } else if (input_scalar.isDouble()) {
+            output_tensor = tensor_to_const(engine, at::tensor(input_scalar.toDouble(), torch::kDouble).to(at::ScalarType::Float));
+        } else if (input_scalar.isBool()) {
+            output_tensor = tensor_to_const(engine, at::tensor(input_scalar.toBool(), torch::kBool));
+        } else {
+            POROS_THROW_ERROR("prim::NumToTensor Converter meets an unsupported scalar type, which leads to fail.");
+            return false;
+        }
+        engine->context().set_tensor(node->outputs()[0], output_tensor);
+        LOG(INFO) << "Output tensor shape: " << output_tensor->getDimensions();
+    }
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, ToConverter);
+POROS_REGISTER_CONVERTER(TensorrtEngine, NumtotensorConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
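+
+/* Hedged reference sketch (not wired into the build): the eager-mode behavior
+ * the two converters above must reproduce. Values are hypothetical. */
+#if 0
+static void to_reference() {
+    at::Tensor x = at::randn({2, 2});
+    // aten::to.dtype: a pure dtype cast; the converter maps it to IIdentityLayer + setOutputType
+    at::Tensor y = x.to(at::kInt);
+    TORCH_CHECK(y.scalar_type() == at::kInt);
+    // prim::NumToTensor: wraps a scalar into a 0-dim tensor, frozen here as an IConstantLayer
+    at::Tensor t = at::scalar_to_tensor(3);
+    TORCH_CHECK(t.dim() == 0);
+}
+#endif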
diff --git a/poros/src/poros/converter/gpu/to.h b/poros/src/poros/converter/gpu/to.h
new file mode 100644
index 0000000000..83f300f741
--- /dev/null
+++ b/poros/src/poros/converter/gpu/to.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file to.h
+* @author wangrui39@baidu.com
+* @date Saturday November 13 11:36:11 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// Corresponds to torch.tensor.to https://pytorch.org/docs/1.9.0/generated/torch.Tensor.to.html?highlight=#torch.to
+class ToConverter : public GpuConverter {
+public:
+    ToConverter() {}
+    virtual ~ToConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor",
+                "aten::to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor",
+                "aten::to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor",
+                "aten::to.dtype_layout(Tensor self, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor)",
+                "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None, bool non_blocking=False, bool copy=False) -> (Tensor(b|a))",
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::to};
+    }
+};
+
+class NumtotensorConverter : public GpuConverter {
+public:
+    NumtotensorConverter() {}
+    virtual ~NumtotensorConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"prim::NumToTensor.Scalar(Scalar a) -> (Tensor)",
+                };
+    }
+
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::prim::NumToTensor};
+    }
+
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"prim::NumToTensor.Scalar(Scalar a) -> (Tensor)", {1, 1}}});
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/converter/gpu/topk.cpp b/poros/src/poros/converter/gpu/topk.cpp
new file mode 100644
index 0000000000..e43bba11bb
--- /dev/null
+++ b/poros/src/poros/converter/gpu/topk.cpp
@@ -0,0 +1,78 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file topk.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 14:29:20 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/topk.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::topk(Tensor self,
+int k,
+int dim=-1,
+bool largest=True,
+bool sorted=True) -> (Tensor values, Tensor indices)",
+*/
+bool TopkConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 5), "invalid inputs size for TopkConverter");
+    POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+        "input[0] for TopkConverter is not Tensor as expected");
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+    auto self_dim = nvdim_to_sizes(self->getDimensions());
+
+    //extract k & dim & largest
+    auto k = (engine->context().get_constant(inputs[1])).toInt();
+    auto dim = (engine->context().get_constant(inputs[2])).toInt();
+    auto largest = (engine->context().get_constant(inputs[3])).toBool();
+
+    if (dim < 0) {
+        dim = self_dim.size() + dim;
+    }
+    uint32_t shift_dim = 1 << dim;
+    auto topk_type = largest ? (nvinfer1::TopKOperation::kMAX) : (nvinfer1::TopKOperation::kMIN);
+    auto new_layer = engine->network()->addTopK(*self, topk_type, k, shift_dim);
+
+    POROS_CHECK(new_layer, "Unable to create topk layer from node: " << *node);
+    new_layer->setName((layer_info(node) + "_ITopKLayer").c_str());
+    engine->context().set_tensor(node->outputs()[0], new_layer->getOutput(0));
+    engine->context().set_tensor(node->outputs()[1], new_layer->getOutput(1));
+    LOG(INFO) << "Output tensor(0) shape: " << new_layer->getOutput(0)->getDimensions();
+    LOG(INFO) << "Output tensor(1) shape: " << new_layer->getOutput(1)->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, TopkConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
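+
+/* Hedged note (not wired into the build): ITopKLayer takes a dimension
+ * *bitmask*, not a dimension index, which is why the converter passes
+ * 1 << dim above. Shapes are hypothetical. */
+#if 0
+static void topk_reference() {
+    at::Tensor x = at::randn({4, 8});
+    // eager-mode equivalent of the node being converted
+    auto result = at::topk(x, /*k=*/2, /*dim=*/1, /*largest=*/true, /*sorted=*/true);
+    uint32_t reduce_axes = 1u << 1;  // bit i set => dimension i participates (0b10 here)
+    (void)result; (void)reduce_axes;
+}
+#endif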
diff --git a/poros/src/poros/converter/gpu/topk.h b/poros/src/poros/converter/gpu/topk.h
new file mode 100644
index 0000000000..3bb309986e
--- /dev/null
+++ b/poros/src/poros/converter/gpu/topk.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file topk.h
+* @author tianjinjin@baidu.com
+* @date Tue Sep 7 14:29:20 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class TopkConverter : public GpuConverter {
+public:
+    TopkConverter() {}
+    virtual ~TopkConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)",
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)",
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::topk,
+                };
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/unary.cpp b/poros/src/poros/converter/gpu/unary.cpp
new file mode 100644
index 0000000000..71835c0e94
--- /dev/null
+++ b/poros/src/poros/converter/gpu/unary.cpp
@@ -0,0 +1,182 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file unary.cpp
+* @author tianjinjin@baidu.com
+* @date Mon Sep 6 20:23:14 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/unary.h"
+#include "poros/converter/gpu/weight.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/context/poros_global.h"
+#include "poros/util/macros.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+"aten::cos(Tensor self) -> Tensor",
+*/
+bool UnaryConverter::converter(TensorrtEngine* engine, const torch::jit::Node *node) {
+    at::ArrayRef<const torch::jit::Value*> inputs = node->inputs();
+    POROS_CHECK_TRUE((inputs.size() == 1), "invalid inputs size for UnaryConverter");
+    if (node->schema().operator_name() !=
+            torch::jit::parseSchema("aten::floor.float(float a) -> (int)").operator_name()) {
+        POROS_CHECK_TRUE((inputs[0]->type()->isSubtypeOf(c10::TensorType::get())),
+            "input[0] for UnaryConverter is not Tensor as expected");
+    }
+
+    //extract self
+    auto self = engine->context().get_tensor(inputs[0]);
+    POROS_CHECK_TRUE((self != nullptr), "Unable to init input tensor for node: " << *node);
+
+    nvinfer1::UnaryOperation trt_type;
+    switch (node->kind()) {
+        case torch::jit::aten::cos:
+            trt_type = nvinfer1::UnaryOperation::kCOS;
+            break;
+        case torch::jit::aten::acos:
+            trt_type = nvinfer1::UnaryOperation::kACOS;
+            break;
+        case torch::jit::aten::cosh:
+            trt_type = nvinfer1::UnaryOperation::kCOSH;
+            break;
+        case torch::jit::aten::sin:
+            trt_type = nvinfer1::UnaryOperation::kSIN;
+            break;
+        case torch::jit::aten::asin:
+            trt_type = nvinfer1::UnaryOperation::kASIN;
+            break;
+        case torch::jit::aten::sinh:
+            trt_type = nvinfer1::UnaryOperation::kSINH;
+            break;
+        case torch::jit::aten::tan:
+            trt_type = nvinfer1::UnaryOperation::kTAN;
+            break;
+        case torch::jit::aten::atan:
+            trt_type = nvinfer1::UnaryOperation::kATAN;
+            break;
+        case torch::jit::aten::abs:
+            trt_type = nvinfer1::UnaryOperation::kABS;
+            break;
+        case torch::jit::aten::floor:
+            trt_type = nvinfer1::UnaryOperation::kFLOOR;
+            break;
+        case torch::jit::aten::reciprocal:
+            trt_type = nvinfer1::UnaryOperation::kRECIP;
+            break;
+        case torch::jit::aten::log:
+            trt_type = nvinfer1::UnaryOperation::kLOG;
+            break;
+        case torch::jit::aten::ceil:
+            trt_type = nvinfer1::UnaryOperation::kCEIL;
+            break;
+        case torch::jit::aten::sqrt:
+            trt_type = nvinfer1::UnaryOperation::kSQRT;
+            break;
+        case torch::jit::aten::exp:
+            trt_type = nvinfer1::UnaryOperation::kEXP;
+            break;
+        case torch::jit::aten::neg:
+            trt_type = nvinfer1::UnaryOperation::kNEG;
+            break;
+        case torch::jit::aten::erf:
+            trt_type = nvinfer1::UnaryOperation::kERF;
+            break;
+        case torch::jit::aten::asinh:
+            trt_type = nvinfer1::UnaryOperation::kASINH;
+            break;
+        case torch::jit::aten::acosh:
+            trt_type = nvinfer1::UnaryOperation::kACOSH;
+            break;
+        case torch::jit::aten::atanh:
+            trt_type = nvinfer1::UnaryOperation::kATANH;
+            break;
+        case torch::jit::aten::log2:
+            trt_type = nvinfer1::UnaryOperation::kLOG;
+            break;
+        case torch::jit::aten::log10:
+            trt_type = nvinfer1::UnaryOperation::kLOG;
+            break;
+        case torch::jit::aten::round:
+            trt_type = nvinfer1::UnaryOperation::kROUND;
+            break;
+        default:
+            POROS_THROW_ERROR("We should never reach here for UnaryConverter, meet Unsupported node kind!");
+    }
+    //IUnaryLayer only support: operation NEG not allowed on type Int32
+    nvinfer1::DataType self_type = self->getType();
+    const nvinfer1::DataType allowed_type = trt_type == nvinfer1::UnaryOperation::kNOT ?
+            nvinfer1::DataType::kBOOL : nvinfer1::DataType::kFLOAT;
+    bool should_cast = (self_type != allowed_type);
+    if (should_cast) {
+        nvinfer1::IIdentityLayer* cast_layer = engine->network()->addIdentity(*self);
+        cast_layer->setName((layer_info(node) + "_IIdentityLayer").c_str());
+        cast_layer->setOutputType(0, allowed_type);
+        self = cast_layer->getOutput(0);
+    }
+
+    auto unary = engine->network()->addUnary(*self, trt_type);
+    POROS_CHECK(unary, "Unable to create unary layer from node: " << *node);
+    unary->setName((layer_info(node) + "_IUnaryLayer").c_str());
+    auto output = unary->getOutput(0);
+    if (trt_type == nvinfer1::UnaryOperation::kLOG) {
+        nvinfer1::ITensor* alphaTensor = nullptr;
+        if (node->kind() == torch::jit::aten::log2) {
+            alphaTensor = tensor_to_const(engine, torch::tensor(std::log2(std::exp(1)), {torch::kFloat32}));
+        } else if (node->kind() == torch::jit::aten::log10) {
+            alphaTensor = tensor_to_const(engine, torch::tensor(std::log10(std::exp(1)), {torch::kFloat32}));
+        } else {
+            // nothing to do for plain aten::log
+        }
+        // ln(x) * log2(e) = log2(x)
+        // ln(x) * log10(e) = log10(x)
+        if (alphaTensor != nullptr) {
+            auto scaleLayer = add_elementwise(engine,
+                            nvinfer1::ElementWiseOperation::kPROD,
+                            output,
+                            alphaTensor,
+                            layer_info(node) + std::string("_prod"));
+            POROS_CHECK(scaleLayer, "Unable to create scale layer from node: " << *node);
+            output = scaleLayer->getOutput(0);
+        }
+    }
+    if (node->schema().operator_name() ==
+            torch::jit::parseSchema("aten::floor.float(float a) -> (int)").operator_name()) {
+        auto identity = engine->network()->addIdentity(*output);
+        identity->setOutputType(0, nvinfer1::DataType::kINT32);
+        identity->setName((layer_info(node) + "_IIdentityLayer_for_output").c_str());
+        output = identity->getOutput(0);
+    } else if (should_cast) {
+        nvinfer1::IIdentityLayer* castback_layer = engine->network()->addIdentity(*output);
+        castback_layer->setName((layer_info(node) + "_IIdentityLayer_for_output").c_str());
+        castback_layer->setOutputType(0, self_type);
+        output = castback_layer->getOutput(0);
+    }
+    engine->context().set_tensor(node->outputs()[0], output);
+    LOG(INFO) << "Output tensor shape: " << output->getDimensions();
+    return true;
+}
+
+POROS_REGISTER_CONVERTER(TensorrtEngine, UnaryConverter);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
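+
+/* Hedged numeric check (not wired into the build; would need <cmath> and
+ * <cassert>) for the change-of-base trick above: TensorRT only provides kLOG
+ * (natural log), so log2/log10 are recovered by a constant multiply. */
+#if 0
+static void log_change_of_base_check() {
+    double x = 8.0;
+    assert(std::abs(std::log(x) * std::log2(std::exp(1.0)) - std::log2(x)) < 1e-9);
+    assert(std::abs(std::log(x) * std::log10(std::exp(1.0)) - std::log10(x)) < 1e-9);
+}
+#endif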
diff --git a/poros/src/poros/converter/gpu/unary.h b/poros/src/poros/converter/gpu/unary.h
new file mode 100644
index 0000000000..b5604e6869
--- /dev/null
+++ b/poros/src/poros/converter/gpu/unary.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file unary.h
+* @author tianjinjin@baidu.com
+* @date Mon Sep 6 20:23:14 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+
+#include "poros/converter/gpu/gpu_converter.h"
+#include "poros/engine/tensorrt_engine.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class UnaryConverter : public GpuConverter {
+public:
+    UnaryConverter() {}
+    virtual ~UnaryConverter() {}
+
+    bool converter(TensorrtEngine* engine, const torch::jit::Node *node);
+
+    const std::vector<std::string> schema_string() {
+        return {"aten::cos(Tensor self) -> Tensor",
+                "aten::acos(Tensor self) -> Tensor",
+                "aten::cosh(Tensor self) -> Tensor",
+                "aten::sin(Tensor self) -> Tensor",
+                "aten::asin(Tensor self) -> Tensor",
+                "aten::sinh(Tensor self) -> Tensor",
+                "aten::tan(Tensor self) -> Tensor",
+                "aten::atan(Tensor self) -> Tensor",
+                "aten::abs(Tensor self) -> Tensor",
+                "aten::floor(Tensor self) -> Tensor",
+                "aten::reciprocal(Tensor self) -> Tensor",
+                "aten::log(Tensor self) -> Tensor",
+                "aten::ceil(Tensor self) -> Tensor",
+                "aten::sqrt(Tensor self) -> Tensor",
+                "aten::exp(Tensor self) -> Tensor",
+                "aten::neg(Tensor self) -> Tensor",
+                "aten::erf(Tensor self) -> Tensor",
+                "aten::asinh(Tensor self) -> Tensor",
+                "aten::acosh(Tensor self) -> Tensor",
+                "aten::atanh(Tensor self) -> Tensor",
+                "aten::log2(Tensor self) -> (Tensor)",
+                "aten::log10(Tensor self) -> (Tensor)",
+                "aten::floor.float(float a) -> (int)",
+                "aten::round(Tensor self) -> (Tensor)"
+                };
+    }
+
+    /** TODO: TO SUPPORT CONVERTERS BELOW:
+     * "aten::cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)",
+     * "aten::cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)",
+     * "NONE OF THEIR .out VARIANTS IS SUPPORTED"
+     * **/
+    const std::vector<torch::jit::NodeKind> node_kind() {
+        return {torch::jit::aten::cos,
+                torch::jit::aten::acos,
+                torch::jit::aten::cosh,
+                torch::jit::aten::sin,
+                torch::jit::aten::asin,
+                torch::jit::aten::sinh,
+                torch::jit::aten::tan,
+                torch::jit::aten::atan,
+                torch::jit::aten::abs,
+                torch::jit::aten::floor,
+                torch::jit::aten::reciprocal,
+                torch::jit::aten::log,
+                torch::jit::aten::ceil,
+                torch::jit::aten::sqrt,
+                torch::jit::aten::exp,
+                torch::jit::aten::neg,
+                torch::jit::aten::erf,
+                torch::jit::aten::asinh,
+                torch::jit::aten::acosh,
+                torch::jit::aten::atanh,
+                torch::jit::aten::log2,
+                torch::jit::aten::log10,
+                torch::jit::aten::round
+                };
+    }
+    bool assign_schema_attr() {
+        return assign_schema_attr_helper({{"aten::floor.float(float a) -> (int)", {1, 1}}});
+    }
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/converter/gpu/weight.cpp b/poros/src/poros/converter/gpu/weight.cpp
new file mode 100644
index 0000000000..1244acccb7
--- /dev/null
+++ b/poros/src/poros/converter/gpu/weight.cpp
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file weight.cpp
+* @author tianjinjin@baidu.com
+* @date Fri Aug 6 14:17:11 CST 2021
+* @brief
+**/
+
+#include "poros/converter/gpu/weight.h"
+#include "poros/engine/trtengine_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+Weights::Weights() {
+    this->inputs_num = 0;
+    this->outputs_num = 0;
+    this->data.type = nvinfer1::DataType::kFLOAT;
+    this->data.values = nullptr;
+    this->data.count = 0;
+}
+
+Weights::Weights(at::Tensor tensor) {
+    POROS_CHECK((tensor.sizes().size() <= nvinfer1::Dims::MAX_DIMS),
+        "given tensor is out of max_dims");
+
+    if (tensor.scalar_type() == c10::ScalarType::Long) {
+        LOG(WARNING) << "Weights meets c10::ScalarType::Long tensor type, change this to c10::ScalarType::Int. "
+                     << "Attention: this may lead to precision change";
+        tensor = tensor.to(at::ScalarType::Int);
+    }
+
+    this->shape = sizes_to_nvdim(tensor.sizes());
+    //TODO: CHECK this bias info.
+    this->inputs_num = (tensor.sizes().size() > 1) ? tensor.sizes()[1] : tensor.sizes()[0];
+    this->outputs_num = tensor.sizes()[0];
+
+    if (tensor.sizes().size() > 2) {
+        this->kernel_shape.nbDims = tensor.sizes().size() - 2;
+        for (size_t i = 2; i < tensor.sizes().size(); i++) {
+            this->kernel_shape.d[i - 2] = tensor.sizes()[i];
+        }
+    } else {
+        this->kernel_shape.nbDims = 1;
+        this->kernel_shape.d[0] = 1;
+    }
+
+    auto t_cpu = tensor.to(at::kCPU);
+    t_cpu = t_cpu.contiguous();
+
+    auto t_type = c10::optTypeMetaToScalarType(t_cpu.dtype());
+    POROS_CHECK(t_type.has_value(), "unsupported datatype");
+    //TODO: may be failed here
+    auto dtype = attype_to_nvtype(t_type.value());
+
+    void* buf = nullptr;
+    if (dtype == nvinfer1::DataType::kFLOAT) {
+        buf = malloc(t_cpu.numel() * sizeof(float));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(float));
+    } else if (dtype == nvinfer1::DataType::kHALF) {
+        buf = malloc(t_cpu.numel() * (sizeof(float) / 2));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * (sizeof(float) / 2));
+    } else if (dtype == nvinfer1::DataType::kINT8) {
+        buf = malloc(t_cpu.numel() * sizeof(char));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(char));
+    } else if (dtype == nvinfer1::DataType::kINT32) {
+        buf = malloc(t_cpu.numel() * sizeof(int));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(int));
+    } else if (dtype == nvinfer1::DataType::kBOOL) {
+        buf = malloc(t_cpu.numel() * sizeof(bool));
+        memcpy(buf, t_cpu.data_ptr(), t_cpu.numel() * sizeof(bool));
+    }
+
+    this->data.type = dtype;
+    this->data.count = t_cpu.numel();
+    this->data.values = buf;
+}
+
+std::ostream& operator<<(std::ostream& os, const Weights& w) {
+    os << "Weights: " << w.shape
+       << "\n    Number of input maps: " << w.inputs_num
+       << "\n    Number of output maps: " << w.outputs_num
+       << "\n    Element shape: [";
+    for (int i = 0; i < w.kernel_shape.nbDims; i++) {
+        os << w.kernel_shape.d[i];
+        if (i + 1 < w.kernel_shape.nbDims) {
+            os << ',';
+        }
+    }
+    os << ']';
+    return os;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
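+
+/* Hedged usage sketch (not wired into the build; printing would need
+ * <iostream>): freezing a conv-style weight tensor. For a [16, 8, 3, 3]
+ * tensor the constructor above yields outputs_num = 16, inputs_num = 8 and
+ * kernel_shape = [3, 3]. */
+#if 0
+static void weights_example() {
+    Weights w(torch::randn({16, 8, 3, 3}));
+    std::cout << w;   // prints shape, map counts and element shape via operator<<
+}
+#endif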
diff --git a/poros/src/poros/converter/gpu/weight.h b/poros/src/poros/converter/gpu/weight.h
new file mode 100644
index 0000000000..13b6e90c81
--- /dev/null
+++ b/poros/src/poros/converter/gpu/weight.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file weight.h
+* @author tianjinjin@baidu.com
+* @date Fri Aug 13 11:14:51 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+#include "torch/script.h"
+#include "NvInfer.h"
+
+#include "poros/engine/tensorrt_engine.h"
+#include "poros/util/macros.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+struct Weights {
+    nvinfer1::Weights data;
+    nvinfer1::Dims kernel_shape;
+    nvinfer1::Dims shape;
+    int64_t inputs_num;
+    int64_t outputs_num;
+
+    Weights();
+    Weights(at::Tensor tensor);
+    // Weights(float val);
+    // Weights(int32_t val);
+    friend std::ostream& operator<<(std::ostream& os, const Weights& w);
+};
+
+inline nvinfer1::ITensor* tensor_to_const(TensorrtEngine* engine, at::Tensor t) {
+    auto t_weights = Weights(t);
+    auto const_layer = engine->network()->addConstant(t_weights.shape, t_weights.data);
+    POROS_CHECK(const_layer, "unable to freeze tensor to constant");
+
+    auto out = const_layer->getOutput(0);
+
+    std::ostringstream tensor_id;
+    tensor_id << reinterpret_cast<const void*>(out);
+
+    LOG(INFO) << "Freezing tensor " << tensor_id.str() << " as an IConstantLayer";
+    const_layer->setName(("[Freeze Tensor " + tensor_id.str() + " ]").c_str());
+
+    return out;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
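+
+/* Hedged usage sketch (not wired into the build), assuming a live
+ * TensorrtEngine* engine inside a converter: */
+#if 0
+static void tensor_to_const_example(TensorrtEngine* engine) {
+    // freeze a host-side constant into the network; converters use such
+    // tensors as elementwise operands, e.g. the log2(e) scale in unary.cpp
+    nvinfer1::ITensor* half = tensor_to_const(engine, torch::tensor({0.5f}));
+    (void)half;
+}
+#endif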
diff --git a/poros/src/poros/converter/iconverter.h b/poros/src/poros/converter/iconverter.h
new file mode 100644
index 0000000000..352892c249
--- /dev/null
+++ b/poros/src/poros/converter/iconverter.h
@@ -0,0 +1,421 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file iconverter.h
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Tue Jul 27 11:24:21 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+#include "torch/script.h"
+#include "ATen/core/function_schema.h"
+#include "torch/csrc/jit/frontend/function_schema_parser.h"
+#include "torch/csrc/jit/ir/ir.h"
+#include "torch/csrc/jit/runtime/custom_operator.h"
+
+#include "poros/context/poros_global.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class IEngine;
+
+// schema attributes
+// by default: dynamic shape is supported, tensor-scalar inputs are not
+struct schema_attr {
+    int is_support_dynamic_shape = 1;
+    int is_support_tensor_scalar = 0;
+};
+
+class IConverter {
+public:
+    virtual ~IConverter() {}
+    /**
+     * @brief Core converter implementation. Note that the node inputs must be mapped to the
+     *        engine's tensors through engine.context: tensor->tensor, constant->constant.
+     * @param [in] engine : the target engine
+     * @param [in] node : the node to convert
+     * @return bool
+     * @retval true => success, false => fail
+     **/
+    virtual bool converter(IEngine* engine, const torch::jit::Node *node) = 0;
+    virtual const std::vector<std::string> schema_string() = 0;
+    virtual const std::vector<torch::jit::NodeKind> node_kind() = 0;
+    const std::unordered_map<c10::OperatorName, schema_attr> get_schema_attr_map() {
+        return _schema_attr_map;
+    }
+protected:
+    /**
+     * @brief help assign schema attr
+     *
+     * @param [in] schema_attr_vec : schema and schema_attr which want to assign
+     * @return true => succeeded false => failed
+     **/
+    virtual bool assign_schema_attr_helper(std::vector<std::pair<std::string, schema_attr>> schema_attr_vec) {
+        if (_schema_attr_map.empty()) {
+            LOG(INFO) << "the schema_attr_map may not have been initialized.";
+            return false;
+        }
+        for (size_t i = 0; i < schema_attr_vec.size(); i++) {
+            c10::OperatorName op_name = torch::jit::parseSchema(schema_attr_vec[i].first).operator_name();
+            if (_schema_attr_map.count(op_name) == 0) {
+                LOG(INFO) << "schema: [ " << schema_attr_vec[i].first << " ] was not found in schema_attr_map";
+                return false;
+            }
+            _schema_attr_map[op_name] = schema_attr_vec[i].second;
+        }
+        return true;
+    }
+    // assign attrs to schemas; overridden by concrete converter subclasses
+    virtual bool assign_schema_attr() {
+        return true;
+    }
+private:
+    // declare ConvertersMap as a friend so that it can call init_schema_attr
+    friend class ConvertersMap;
+    // initialize the schema attrs; called on the concrete converter subclass
+    bool init_schema_attr() {
+        _schema_attr_map.clear();
+        std::vector<std::string> schema_strings = this->schema_string();
+        schema_attr attr;
+        for (const std::string& s : schema_strings) {
+            _schema_attr_map.insert({torch::jit::parseSchema(s).operator_name(), attr});
+        }
+        return assign_schema_attr();
+    }
+    std::unordered_map<c10::OperatorName, schema_attr> _schema_attr_map;
+};
+
+struct ConverterOptions {
+    std::vector<c10::OperatorName> valid_schemas;
+
+    ConverterOptions() = default;
+
+    ConverterOptions& set_valid_schemas(std::vector<std::string> schema_string) {
+        use_options = true;
+        for (auto s : schema_string) {
+            valid_schemas.push_back(torch::jit::parseSchema(s).operator_name());
+        }
+        return *this;
+    }
+
+    bool use() {
+        return use_options;
+    }
+private:
+    bool use_options = false;
+};
+
+struct ConvRegistration {
+    torch::jit::NodeKind kind;
+    IConverter* converter;
+    ConverterOptions options;
+};
+
+class ConvertersMap {
+public:
+    ConvertersMap() {}
+
+    virtual ~ConvertersMap() {
+    }
+
+    //add a converter to the current map.
+    bool add_converter(torch::jit::NodeKind node_kind, ConvRegistration conv_reg) {
+        auto iter = converters_map.find(node_kind);
+        if (iter != converters_map.end()) {
+            LOG(WARNING) << "override converter for [ " << node_kind.toQualString() << " ]";
+        }
+        converters_map[node_kind] = std::move(conv_reg);
+        return true;
+    }
+
+    IConverter* get_converter(const torch::jit::Node* node) {
+        if (!node_converterable(node)) {
+            return nullptr;
+        }
+        auto node_kind = node->kind();
+        auto iter = converters_map.find(node_kind);
+        if (iter == converters_map.end()) {
+            return nullptr;
+        }
+        auto conv_reg = iter->second;
+        if (conv_reg.options.use()) {
+            if (conv_reg.options.valid_schemas.size() != 0) {
+                auto schema = node->maybeSchema();
+                if (!schema) {
+                    return nullptr;
+                }
+                for (auto reg_schema : conv_reg.options.valid_schemas) {
+                    if (reg_schema == schema->operator_name()) {
+                        return conv_reg.converter;
+                    }
+                }
+                return nullptr;
+            }
+        }
+        return conv_reg.converter;
+    }
+
+    // check whether the length of list-typed inputs/outputs varies
+    bool list_size_is_variable_length(const torch::jit::Node *node) {
+        auto list_size_map_input = PorosGlobalContext::instance()._list_size_map._list_size_map_input;
+        for (size_t i = 0; i < node->inputs().size(); i++) {
+            auto value = node->input(i);
+            // if the value is list-typed
+            if (value->type()->kind() == c10::TypeKind::ListType) {
+                if (list_size_map_input.count(value) != 0 && list_size_map_input[value].count(const_cast<torch::jit::Node*>(node)) != 0) {
+                    // if more than one size is recorded for this value (the list variable) at this
+                    // node, its length varies; return true so the node is treated as non-convertible
+                    if (list_size_map_input[value].at(const_cast<torch::jit::Node*>(node)).size() != 1) {
+                        return true;
+                    }
+                }
+            }
+        }
+        // check output list variables for varying length, same logic as the inputs
+        auto list_size_map_output = PorosGlobalContext::instance()._list_size_map._list_size_map_output;
+        for (size_t i = 0; i < node->outputs().size(); i++) {
+            auto value = node->output(i);
+            if (value->type()->kind() == c10::TypeKind::ListType) {
+                if (list_size_map_output.count(value) != 0 && list_size_map_output[value].count(const_cast<torch::jit::Node*>(node)) != 0) {
+                    if (list_size_map_output[value].at(const_cast<torch::jit::Node*>(node)).size() != 1) {
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
+    // check nodes with special output types, e.g. list[list[]]
+    bool special_node_check(const torch::jit::Node *node) {
+        if (node->kind() == torch::jit::prim::ListConstruct) {
+            const torch::jit::Value* output = node->outputs()[0];
+            if (output->type()->str().find("[][]") != output->type()->str().npos) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // check whether the node has tensor-scalar inputs outside the supported op set
+    bool is_unsupport_tensor_scalar_inputs(const torch::jit::Node *node,
+                        std::unordered_map<c10::OperatorName, schema_attr> schema_attr_map) {
+        if (node->kind() == torch::jit::prim::CudaFusionGroup) {
+            return false;
+        }
+        for (size_t i = 0; i < node->inputs().size(); i++) {
+            // if the input is a scalar or a scalar list and does not come from prim::Constant
+            torch::jit::Value* current_input = node->input(i);
+            if ((current_input->type()->isSubtypeOf(c10::NumberType::get()) ||
+                current_input->type()->isSubtypeOf(c10::BoolType::get()) ||
+                current_input->type()->isSubtypeOf(c10::StringType::get()) ||
+                current_input->type()->isSubtypeOf(c10::ListType::ofFloats()) ||
+                current_input->type()->isSubtypeOf(c10::ListType::ofInts()) ||
+                current_input->type()->isSubtypeOf(c10::ListType::ofBools()) ||
+                current_input->type()->isSubtypeOf(c10::ListType::ofStrings())
+                ) &&
+                current_input->node()->kind() != torch::jit::prim::Constant) {
+                // check whether the node is: 1. prim::ListConstruct, 2. prim::ListUnpack,
+                // or 3. a schema that supports scalar-tensor inputs. if it is none of these,
+                // it is unsupported: return true so the node is treated as non-convertible
+                if (!node->maybeSchema()) {
+                    // these two ops have no schema and need a separate check
+                    if (node->kind() == torch::jit::prim::ListConstruct ||
+                        node->kind() == torch::jit::prim::ListUnpack) {
+                        return false;
+                    } else {
+                        return true;
+                    }
+                } else {
+                    if (schema_attr_map[node->maybeSchema()->operator_name()].is_support_tensor_scalar == 1) {
+                        return false;
+                    } else {
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
+    bool node_converterable(const torch::jit::Node* node) {
+        auto node_kind = node->kind();
+        auto iter = converters_map.find(node_kind);
+        if (iter == converters_map.end()) {
+            LOG(WARNING) << "no converter find for [ " << node_kind.toQualString() << " ]";
+            if (node->maybeSchema()) {
+                LOG(WARNING) << "unsupported schema is [ " << *node->maybeSchema() << " ]";
+            }
+            return false;
+        }
+        auto conv_reg = iter->second;
+        if (conv_reg.options.use()) {
+            if (conv_reg.options.valid_schemas.size() != 0) {
+                auto schema = node->maybeSchema();
+                if (!schema) {
+                    LOG(WARNING) << "no schema find for [ " << node_kind.toQualString() << " ]";
+                    return false;
+                }
+                // check user-defined unsupported node schemas
+                if (_unsupport_schema_set.count(schema->operator_name())) {
+                    LOG(WARNING) << "The user specifies that the unsupported node schema is [ " << *schema << " ]";
+                    return false;
+                }
+                // check user-defined unsupported node kinds
+                if (_unsupport_nodekind_set.count(node->kind())) {
+                    LOG(WARNING) << "The user specifies that the unsupported node kind is [ " << node->kind().toQualString() << " ]";
+                    return false;
+                }
+                // due to TensorRT limitations, when aten::_convolution is a deconvolution
+                // (transposed == true) the output_padding parameter must be zero; otherwise unsupported
+                if (node->kind() == torch::jit::aten::_convolution && node->inputs().size() >= 12) {
+                    if (node->input(6)->node()->kind() == torch::jit::prim::Constant &&
+                        node->input(6)->type()->kind() == c10::TypeKind::BoolType &&
+                        node->input(7)->node()->kind() == torch::jit::prim::Constant &&
+                        node->input(7)->type()->isSubtypeOf(c10::ListType::ofInts())) {
+
+                        bool transposed = toIValue(node->input(6)).value().toBool();
+                        auto input_7_vec = toIValue(node->input(7)).value().toIntVector();
+                        if (transposed && (input_7_vec[0] > 0 || input_7_vec[1] > 0)) {
+                            LOG(INFO) << "TensorRT does not have a notion of output_padding for deconvolution layers."
+                                         " output_padding has to be set as zeros.";
+                            return false;
+                        }
+                    }
+                }
+                IConverter* current_converter = conv_reg.converter;
+                if (!current_converter->init_schema_attr()) {
+                    LOG(WARNING) << "converter [ " << node_kind.toQualString() << " ] failed to initialize schema attribute.";
+                    return false;
+                }
+                auto conv_schema_attr = current_converter->get_schema_attr_map();
+                if (conv_schema_attr.count(schema->operator_name()) == 0) {
+                    LOG(WARNING) << "no supported schema find for [ " << node_kind.toQualString() << " ]";
+                    LOG(WARNING) << "unsupported schema is [ " << *schema << " ]";
+                    return false;
+                }
+
+                // in dynamic mode, reject schemas that do not support dynamic shape
+                PorosOptions poros_options = PorosGlobalContext::instance().get_poros_options();
+                if (poros_options.is_dynamic) {
+                    if (conv_schema_attr[schema->operator_name()].is_support_dynamic_shape == 0) {
+                        LOG(WARNING) << "no supported dynamic schema is [ " << *schema << " ]";
+                        return false;
+                    }
+                }
+
+                // special checks for special ops, e.g. list[list[]] support in the ListConstruct converter
+                if (special_node_check(node)) {
+                    LOG(WARNING) << "node input or output type is not support: " << node_kind.toQualString();
+                    return false;
+                }
+
+                // whether list-typed inputs/outputs vary in length
+                if (list_size_is_variable_length(node)) {
+                    LOG(WARNING) << "input or output is variable length list [ " << node_kind.toQualString() << " ]";
+                    return false;
+                }
+
+                // scalar handling: whether the op supports using scalars as tensors
+                if (is_unsupport_tensor_scalar_inputs(node, conv_schema_attr)) {
+                    LOG(WARNING) << "unsupport nvtensor scalar input node is [ " << node_kind.toQualString() << " ]";
+                    return false;
+                }
+
+                // if the op is mutable, check whether it is within the currently supported range
+                if (node->kind().is_aten() && node->schema().is_mutable() && node->input(0)->type()->kind() == c10::TypeKind::ListType) {
+                    if (PorosGlobalContext::instance().supported_mutable_ops_set.count(node->kind()) == 0) {
+                        LOG(WARNING) << "Meet unsupport mutable node. The node is [ " << node_kind.toQualString() << " ]";
+                        return false;
+                    }
+                }
+            } else {
+                // when the user supplies an unsupported-op list, nodes that only have a kind
+                // and no schema are matched by node kind alone
+                if (_unsupport_nodekind_set.count(node->kind())) {
+                    LOG(WARNING) << "The user specifies that the unsupported node kind is [ " << node->kind().toQualString() << " ]";
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    // initialize the unsupported-op sets from the global options
+    void init_unsupport_op_set() {
+        try {
+            std::vector<std::string> unsupport_op_vec = PorosGlobalContext::instance().get_poros_options().unsupport_op_list;
+            for (size_t i = 0; i < unsupport_op_vec.size(); i++) {
+                std::string line = unsupport_op_vec[i];
+                if (line.size() == 0) {
+                    continue;
+                }
+                auto schema_or_opname = torch::jit::parseSchemaOrName(line);
+                // operator name
+                if (schema_or_opname.is_left()) {
+                    torch::jit::NodeKind node_kind = c10::Symbol::fromQualString(line);
+                    if (!converters_map.count(node_kind)) {
+                        LOG(WARNING) << "WARNING: The user-defined unsupported nodekind [ " << node_kind.toQualString() << " ] cannot be found in the poros supported op set."
+                                        " Please check the PorosOptions.unsupport_op_list input.";
+                    }
+                    _unsupport_nodekind_set.insert(node_kind);
+                    LOG(INFO) << "The user-defined unsupported node kind is [ " << node_kind.toQualString() << " ].";
+                // schema
+                } else {
+                    c10::FunctionSchema fs = schema_or_opname.right();
+                    std::string fs_name = fs.name();
+                    auto end_index = fs_name.find_last_of('.') == std::string::npos ? fs_name.size() : fs_name.find_last_of('.');
+                    std::string node_kind_name = fs_name.substr(0, end_index);
+                    torch::jit::NodeKind node_kind = c10::Symbol::fromQualString(node_kind_name);
+                    c10::OperatorName node_op_name = schema_or_opname.right().operator_name();
+                    if (converters_map.count(node_kind)) {
+                        std::vector<c10::OperatorName> node_valid_schema_vec = converters_map[node_kind].options.valid_schemas;
+                        int matched = 0;
+                        for (auto i : node_valid_schema_vec) {
+                            if (i == node_op_name) {
+                                matched++;
+                            }
+                        }
+                        if (!matched) {
+                            LOG(WARNING) << "WARNING: The user-defined unsupported schema [ " << line << " ] cannot be found in the poros supported op set."
+                                            " Please check the PorosOptions.unsupport_op_list input.";
+                        }
+                    } else {
+                        LOG(WARNING) << "WARNING: The user-defined unsupported schema nodekind [ " << node_kind.toQualString() << " ] cannot be found in the poros supported op set."
+                                        " Please check the PorosOptions.unsupport_op_list input.";
+                    }
+                    _unsupport_schema_set.insert(node_op_name);
+                    LOG(INFO) << "The user-defined unsupported schema is [ " << fs << " ].";
+                }
+            }
+        } catch (...) {
+            LOG(WARNING) << "WARNING: Failed to initialize user-defined unsupport operator list. Please check the PorosOptions.unsupport_op_list parameter.";
+        }
+    }
+
+private:
+    std::set<std::string> converter_schemas;
+    std::unordered_map<torch::jit::NodeKind, ConvRegistration> converters_map;
+    std::unordered_set<torch::jit::NodeKind> _unsupport_nodekind_set;
+    std::unordered_set<c10::OperatorName> _unsupport_schema_set;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
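+
+/* Hedged sketch (not wired into the build): how a converter subclass opts one
+ * of its schemas into tensor-scalar input support. The schema string must
+ * already appear in its schema_string() list; StackConverter in this diff does
+ * exactly this. MyConverter is a hypothetical name. */
+#if 0
+class MyConverter : public IConverter {
+    // ... converter(), schema_string(), node_kind() ...
+    bool assign_schema_attr() override {
+        // schema_attr fields: {is_support_dynamic_shape, is_support_tensor_scalar}
+        return assign_schema_attr_helper({{"aten::stack(Tensor[] tensors, int dim=0) -> Tensor", {1, 1}}});
+    }
+};
+#endif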
diff --git a/poros/src/poros/engine/engine.cpp b/poros/src/poros/engine/engine.cpp
new file mode 100644
index 0000000000..9b1c55715d
--- /dev/null
+++ b/poros/src/poros/engine/engine.cpp
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file engine.cpp
+* @author huangben@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#include "poros/engine/iengine.h"
+#include "poros/context/poros_global.h"
+#include "poros/converter/iconverter.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+bool IEngine::is_node_supported(const torch::jit::Node* node) {
+    auto converter_map = PorosGlobalContext::instance().get_converter_map(who_am_i());
+    if (converter_map != nullptr && converter_map->node_converterable(node)) {
+        return true;
+    } else {
+        if (node->kind() != torch::jit::prim::Loop &&
+            node->kind() != torch::jit::prim::If &&
+            node->kind() != torch::jit::prim::CudaFusionGroup &&
+            node->kind() != torch::jit::prim::Param) {
+            LOG(INFO) << "not supported node: " << node->kind().toQualString()
+                      << ", detail info: " << *node;
+        }
+        return false;
+    }
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
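+
+/* Hedged usage sketch (not wired into the build): partition-time check while
+ * walking a graph, assuming an IEngine* engine and a torch::jit::Node* node
+ * obtained from the graph. */
+#if 0
+static void partition_example(IEngine* engine, const torch::jit::Node* node) {
+    if (engine->is_node_supported(node)) {
+        // the node can be placed into the engine-backed subgraph
+    }
+}
+#endif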
+
+/**
+* @file engine_context.h
+* @author tianjinjin@baidu.com
+* @date Fri Jul 23 11:21:10 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "torch/script.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+template <typename T>
+class EngineContext {
+public:
+    explicit EngineContext() {}
+
+    T* get_tensor(const torch::jit::Value* value) {
+        auto it = _value_tensor_map.find(value);
+        if (it == _value_tensor_map.end()) {
+            return nullptr;
+        }
+        return it->second;
+    }
+
+    bool set_tensor(const torch::jit::Value* value, T* tensor) {
+        if (value != nullptr && tensor != nullptr) {
+            _value_tensor_map[value] = tensor;
+            return true;
+        }
+        return false;
+    }
+
+    bool get_tensorlist(const torch::jit::Value* value, std::vector<T*>& tensorlist) {
+        auto it = _value_tensorlist_map.find(value);
+        if (it == _value_tensorlist_map.end()) {
+            return false;
+        }
+        tensorlist = it->second;
+        return true;
+    }
+
+    bool set_tensorlist(const torch::jit::Value* value, std::vector<T*> tensorlist) {
+        if (value != nullptr) {
+            _value_tensorlist_map[value] = tensorlist;
+            return true;
+        }
+        return false;
+    }
+
+    torch::jit::IValue get_constant(const torch::jit::Value* value) {
+        auto it = _value_constant_map.find(value);
+        if (it != _value_constant_map.end()) {
+            return it->second;
+        } else {
+            return torch::jit::IValue();
+        }
+    }
+
+    bool set_constant(const torch::jit::Value* value, torch::jit::IValue constant) {
+        if (value != nullptr) {
+            _value_constant_map[value] = constant;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    std::string _engine_id;
+    //value <-> nvtensor
+    std::unordered_map<const torch::jit::Value*, T*> _value_tensor_map;
+    //value <-> nvtensor list
+    std::unordered_map<const torch::jit::Value*, std::vector<T*>> _value_tensorlist_map;
+    //value <-> others
+    std::unordered_map<const torch::jit::Value*, torch::jit::IValue> _value_constant_map;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
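+
+// Usage sketch (illustrative only, not called anywhere in this patch): the context
+// maps torch::jit::Value pointers to engine-side tensors while a subgraph is being
+// converted. Assuming T = nvinfer1::ITensor, as the TensorRT engine below instantiates it:
+//
+//     EngineContext<nvinfer1::ITensor> ctx;
+//     nvinfer1::ITensor* trt_in = network->addInput("input_0", nv_type, nv_dims);
+//     ctx.set_tensor(graph->inputs()[0], trt_in);                      // remember the mapping
+//     nvinfer1::ITensor* found = ctx.get_tensor(graph->inputs()[0]);   // nullptr if unmapped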
diff --git a/poros/src/poros/engine/iengine.h b/poros/src/poros/engine/iengine.h
new file mode 100644
index 0000000000..a9d1094bc2
--- /dev/null
+++ b/poros/src/poros/engine/iengine.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file iengine.h
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <string>
+
+//from pytorch
+#include "torch/script.h"
+#include "torch/csrc/jit/ir/ir.h"
+#include "ATen/core/interned_strings.h"
+
+#include "poros/iplugin/plugin_create.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * the base engine class
+ * every registered engine should inherit from this IEngine
+ **/
+
+struct PorosGraph {
+    torch::jit::Graph* graph = NULL;
+    torch::jit::Node* node = NULL;
+};
+
+typedef uint64_t EngineID;
+
+class IEngine : public IPlugin, public torch::CustomClassHolder {
+public:
+    virtual ~IEngine() {}
+
+    /**
+     * @brief init. The engine is usable only after init() has succeeded.
+     * @return int
+     * @retval 0 => success, <0 => fail
+     **/
+    virtual int init() = 0;
+
+    /**
+     * @brief Core implementation. At compile time, converts the subgraph into the engine's own
+     *        graph representation and stores it inside the engine, so that excute_engine can run
+     *        it at runtime. All ops are guaranteed to be supported here.
+     * @param [in] sub_graph : the subgraph to convert
+     * @return [res]int
+     * @retval 0 => success, <0 => fail
+     **/
+    virtual int transform(const PorosGraph& sub_graph) = 0;
+
+    /**
+     * @brief runtime execution logic of the subgraph
+     * @param [in] inputs : input tensors
+     * @return [res] output tensors
+     **/
+    virtual std::vector<at::Tensor> excute_engine(const std::vector<at::Tensor>& inputs) = 0;
+
+    virtual void register_module_attribute(const std::string& name, torch::jit::Module& module) = 0;
+
+    // engine identification
+    virtual const std::string who_am_i() = 0;
+
+    // whether the node is supported by the current engine
+    bool is_node_supported(const torch::jit::Node* node);
+
+public:
+    std::pair<uint64_t, uint64_t> _num_io; // number of input/output parameters
+    EngineID _id;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
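+
+// Usage sketch (MyEngine is a hypothetical name, for illustration only):
+//
+//     class MyEngine : public IEngine {
+//     public:
+//         int init() override;
+//         int transform(const PorosGraph& sub_graph) override;
+//         std::vector<at::Tensor> excute_engine(const std::vector<at::Tensor>& inputs) override;
+//         void register_module_attribute(const std::string& name, torch::jit::Module& module) override;
+//         const std::string who_am_i() override { return "MyEngine"; }
+//     };
+//
+// A concrete engine is then made discoverable through the plugin registry, as
+// tensorrt_engine.cpp below does with POROS_REGISTER_ENGINE(TensorrtEngine).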
diff --git a/poros/src/poros/engine/tensorrt_engine.cpp b/poros/src/poros/engine/tensorrt_engine.cpp
new file mode 100644
index 0000000000..650db10033
--- /dev/null
+++ b/poros/src/poros/engine/tensorrt_engine.cpp
@@ -0,0 +1,515 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file tensorrt_engine.cpp
+* @author tianjinjin@baidu.com
+* @author huangben@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+#include "poros/engine/tensorrt_engine.h"
+
+#include "poros/context/poros_global.h"
+#include "poros/converter/gpu/converter_util.h"
+#include "poros/converter/iconverter.h"
+// #include "poros/engine/trtengine_util.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+TensorrtEngine::TensorrtEngine() : _logger(get_nvlogger().torch_level()), _builder(nullptr),
+    _network(nullptr), _cfg(nullptr), _runtime(nullptr), _cuda_engine(nullptr),
+    _exec_ctx(nullptr), _mutex(nullptr) {
+    // init nvinfer plugins
+    initLibNvInferPlugins(&_logger, "");
+}
+
+TensorrtEngine::TensorrtEngine(std::string engine_str) : _logger(get_nvlogger().torch_level()) {
+    init();
+    _cuda_engine = make_shared_ptr(_runtime->deserializeCudaEngine((void*)engine_str.c_str(), engine_str.length()));
+    _exec_ctx = make_shared_ptr(_cuda_engine->createExecutionContext());
+    binding_io();
+}
+
+//TensorrtEngine::~TensorrtEngine() {
+//    //_exec_ctx->destroy();
+//    //_cuda_engine->destroy();
+//    //_runtime->destroy();
+//}
+
+void TensorrtEngine::binding_io() {
+    uint64_t inputs = 0;
+    uint64_t outputs = 0;
+
+    for (int64_t idx = 0; idx < _cuda_engine->getNbBindings(); idx++) {
+        std::string name = _cuda_engine->getBindingName(idx);
+        //if (name.find("profile") != name.npos) {
+        //    continue;
+        //}
+        std::string idx_s = name.substr(name.find("_") + 1);
+        uint64_t idx_new = static_cast<uint64_t>(std::stoi(idx_s));
+        if (_cuda_engine->bindingIsInput(idx)) {
+            inputs++;
+            _in_binding_map[idx] = idx_new;
+        } else {
+            outputs++;
+            _out_binding_map[idx] = idx_new;
+        }
+    }
+    _num_io = std::make_pair(inputs, outputs);
+}
+
+int TensorrtEngine::init() {
+    // init nvinfer plugins
+    initLibNvInferPlugins(&_logger, "");
+    _mutex = std::make_shared<std::mutex>();
+    _poros_options = PorosGlobalContext::instance().get_poros_options();
+
+    _builder = make_shared_ptr(nvinfer1::createInferBuilder(_logger));
+    _network = make_shared_ptr(_builder->createNetworkV2(1U << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
+    _cfg = make_shared_ptr(_builder->createBuilderConfig());
+    _runtime = make_shared_ptr(nvinfer1::createInferRuntime(_logger));
+
+    // Nvidia TF32 is enabled by default.
+    // If it is not wanted, the BuilderFlag::kTF32 flag should be cleared.
+    if (!_poros_options.use_nvidia_tf32) {
+        _cfg->clearFlag(nvinfer1::BuilderFlag::kTF32);
+    }
+    if (_poros_options.use_fp16) {
+        _cfg->setFlag(nvinfer1::BuilderFlag::kFP16);
+    }
+#if NV_TENSORRT_MAJOR >= 8 && NV_TENSORRT_MINOR >= 3
+    // trt version >= 8.3
+    _cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, _poros_options.max_workspace_size);
+#else
+    _cfg->setMaxWorkspaceSize(_poros_options.max_workspace_size);
+#endif
+    return 0;
+}
+
+int TensorrtEngine::transform(const PorosGraph& sub_graph) {
+    //step1. get the given graph
+    torch::jit::Graph* to_trans_graph = sub_graph.graph;
+    //PorosGraph to_trans_sub_graph = {to_trans_graph.get(), sub_graph.node};
+
+    //step2. init the engine input
+    if (init_engine_inputs(sub_graph) < 0) {
+        LOG(ERROR) << " init engine inputs failed";
+        return -1;
+    }
+
+    //step3. get the op converter map that the TensorRT engine supports
+    ConvertersMap* converter_map = PorosGlobalContext::instance().get_converter_map(who_am_i());
+    if (converter_map == nullptr) {
+        LOG(ERROR) << "could not find given engine [ " << who_am_i() << " ] in global context";
+        return -1;
+    }
+
+    //step4. convert the nodes in the given graph one by one. this is the core function.
+    const torch::jit::Block* block = to_trans_graph->block();
+    for (const torch::jit::Node* node : block->nodes()) {
+        IConverter* conv = converter_map->get_converter(node);
+        if (nullptr == conv) {
+            LOG(ERROR) << "pre judgment failed: " << node_info(node);
+            return -1;
+        }
+
+        LOG(INFO) << "start to converter for: " << node_info(node);
+        if (!conv->converter(this, node)) {
+            LOG(ERROR) << "converter for node failed [ " << *node->maybeSchema() << " ]";
+            return -1;
+        }
+    }
+
+    //step5. mark the graph output.
+    at::ArrayRef<const torch::jit::Value*> graph_outputs = block->outputs();
+    if (mark_graph_outputs(graph_outputs) < 0) {
+        LOG(ERROR) << " mark graph outputs failed";
+        return -1;
+    }
+
+    //step6. build cuda engine.
+    _cuda_engine = make_shared_ptr(_builder->buildEngineWithConfig(*_network, *_cfg));
+    if (!_cuda_engine) {
+        LOG(ERROR) << "build tensorrt engine failed";
+        return -1;
+    }
+
+    //step7. create execution context and binding io
+    // Easy way to get a unique name for each engine, maybe there is a more
+    // descriptive way (using something associated with the graph maybe)
+    _id = reinterpret_cast<EngineID>(_cuda_engine.get());
+    _exec_ctx = make_shared_ptr(_cuda_engine->createExecutionContext());
+    binding_io();
+
+    return 0;
+}
+
+//DEPRECATED
+inline void TensorrtEngine::gen_tensorrt_input_type(const torch::jit::Value* input,
+                                                    nvinfer1::DataType& input_type) {
+    if (input->type()->isSubtypeOf(c10::BoolType::get())) {
+        input_type = nvinfer1::DataType::kBOOL;
+    //NumberTypes below
+    } else if (input->type()->isSubtypeOf(c10::IntType::get())) {
+        input_type = nvinfer1::DataType::kINT32;
+    } else if (input->type()->isSubtypeOf(c10::FloatType::get())) {
+        input_type = nvinfer1::DataType::kFLOAT;
+    } else {
+        //TODO: TO ADD LOGGER
+    }
+}
+
+nvinfer1::Dims TensorrtEngine::gen_dynamic_dims(torch::jit::Value* value) {
+    if (PorosGlobalContext::instance()._value_dynamic_shape_map.count(value) <= 0) {
+        LOG(ERROR) << "value is not in value_dynamic_shape_map";
+        throw std::runtime_error("value is not in value_dynamic_shape_map");
+    }
+    std::vector<int64_t> sizes = PorosGlobalContext::instance()._value_dynamic_shape_map[value].sizes;
+    return sizes_to_nvdim(sizes);
+}
+
+//try to extract input value from subgraph_node
+int TensorrtEngine::init_engine_inputs(const PorosGraph& sub_graph) {
+    torch::jit::Node* subgraph_node = sub_graph.node;
+    torch::jit::Graph* subgraph = sub_graph.graph;
+    AT_ASSERT(subgraph_node->kind() == torch::jit::prim::CudaFusionGroup);
+    at::ArrayRef<torch::jit::Value*> graph_inputs = subgraph->inputs();
+    at::ArrayRef<torch::jit::Value*> node_inputs = subgraph_node->inputs();
+
+    nvinfer1::IOptimizationProfile* profile = _builder->createOptimizationProfile();
+
+    bool total_is_dynamic = false;
+    for (size_t i = 0; i < graph_inputs.size(); i++) {
+        torch::jit::Value* in = graph_inputs[i];
+        torch::jit::Value* node_in = node_inputs[i];
+        std::string name = std::string("input_") + std::to_string(i);
+
+        nvinfer1::DataType nv_type;
+        if (!gen_tensor_type(*subgraph_node, i, nv_type)) {
+            LOG(WARNING) << "init_engine_inputs failed: reason: can't gen nv_type info from input";
+            return -1;
+        }
+
+        // By poros's current design, all subgraph inputs have already been turned into tensors
+        // during subgraph segmentation. A non-tensor type here is outside poros's expectations
+        // and is not handled.
+        if (in->type()->isSubtypeOf(c10::TensorType::get()) == false) {
+            LOG(WARNING) << "not supported input type by tensorrt: " << node_info(in->node());
+            return -1;
+        }
+
+        std::vector<int64_t> sizes;
+        if (!gen_dims_for_tensor(in, sizes)) {
+            LOG(WARNING) << "gen_dims_for_tensor failed for: " << in->debugName();
+            return -1;
+        }
+        nvinfer1::Dims nv_dims = sizes_to_nvdim(sizes);
+        bool current_dynamic = false;
+        if (std::find(sizes.begin(), sizes.end(), -1) != sizes.end() || input_is_dynamic(node_in)) {
+            total_is_dynamic = true;
+            current_dynamic = true;
+        }
+        // Note: the NV API offers no a-priori way to tell whether an input is a shape tensor.
+        // For now we decide whether a tensor-scalar input falls into the shape-tensor range
+        // this way, which may occasionally misjudge.
+        bool is_shape_tensor = false;
+        if (nv_type == nvinfer1::DataType::kINT32 && nv_dims.nbDims <= 1 &&
+                node_in->node()->kind() == torch::jit::aten::tensor) {
+            torch::jit::use_list in_use_list = in->uses();
+            for (size_t u = 0; u < in_use_list.size(); u++) {
+                if (in_use_list[u].user->kind() == torch::jit::aten::IntImplicit ||
+                    in_use_list[u].user->kind() == torch::jit::prim::tolist) {
+                    is_shape_tensor = true;
+                    _in_shape_tensor_index.emplace(i);
+                    break;
+                }
+            }
+        }
+        // The tensor-scalar case above (nv_type is nvinfer1::DataType::kINT32 and nv_dims.nbDims <= 1)
+        // is introduced by our own AdjustmentScalarInputs pass and is backed by int_intlist_values_map
+        // warm-up data, so the real max/min/opt values can be obtained. For any other tensor-scalar
+        // input, the max/min/opt recorded in value_dynamic_shape_map are all 0 and the converters
+        // behind the engine would fail on it, so it has to be intercepted here in advance.
+        // todo: add int_intlist_values_map warm-up data for other tensor scalars during warm-up.
+        if (nv_dims.nbDims < 1 && !is_shape_tensor) {
+            LOG(WARNING) << "init_engine_inputs failed: reason: Met an unknown tensor scalar with 0 dim.";
+            return -1;
+        }
+
+        nvinfer1::ITensor* trt_in = nullptr;
+        if (is_shape_tensor) {
+            c10::List<int64_t> int_sizes = {1};
+            nv_dims = nv_dims.nbDims == 0 ? sizes_to_nvdim(int_sizes) : nv_dims;
+            trt_in = _network->addInput(name.c_str(), nv_type, nv_dims);
+            int32_t nbvalues = nv_dims.d[0];
+
+            std::unique_ptr<int32_t[]> max_values(new int32_t[nbvalues]);
+            std::unique_ptr<int32_t[]> min_values(new int32_t[nbvalues]);
+            std::unique_ptr<int32_t[]> opt_values(new int32_t[nbvalues]);
+
+            if (PorosGlobalContext::instance()._value_dynamic_shape_map.count(node_in) == 0) {
+                LOG(WARNING) << "can't find %" << node_in->debugName() << " in global _value_dynamic_shape_map!";
+                return -1;
+            }
+            ValueDynamicShape int_value_max_min_opt;
+            int_value_max_min_opt = PorosGlobalContext::instance()._value_dynamic_shape_map[node_in];
+
+            std::vector<int64_t> min_values_in_map = int_value_max_min_opt.min_shapes;
+            std::vector<int64_t> max_values_in_map = int_value_max_min_opt.max_shapes;
+            std::vector<int64_t> opt_values_in_map = int_value_max_min_opt.opt_shapes;
+
+            if ((size_t)nbvalues != min_values_in_map.size() ||
+                (size_t)nbvalues != max_values_in_map.size() ||
+                (size_t)nbvalues != opt_values_in_map.size()) {
+                LOG(WARNING) << "input %" << node_in->debugName() << " int or int[] length must match the size of max || min || opt vector!";
+                return -1;
+            }
+
+            for (int v = 0; v < nbvalues; v++) {
+                max_values[v] = max_values_in_map[v];
+                min_values[v] = min_values_in_map[v];
+                opt_values[v] = opt_values_in_map[v];
+            }
+
+            bool ret_min = profile->setShapeValues(trt_in->getName(), nvinfer1::OptProfileSelector::kMIN, min_values.get(), nbvalues);
+            bool ret_max = profile->setShapeValues(trt_in->getName(), nvinfer1::OptProfileSelector::kMAX, max_values.get(), nbvalues);
+            bool ret_opt = profile->setShapeValues(trt_in->getName(), nvinfer1::OptProfileSelector::kOPT, opt_values.get(), nbvalues);
+
for value: %" << node_in->debugName() << " failed" + << ", min_shape_info: " << sizes_to_nvdim(min_values_in_map) + << ", opt_shape_info: " << sizes_to_nvdim(opt_values_in_map) + << ", max_shape_info: " << sizes_to_nvdim(max_values_in_map); + return -1; + } + + LOG(INFO) << "Init shape tensor input ok: %" << node_in->debugName() + << ", min_shape_info: " << sizes_to_nvdim(min_values_in_map) + << ", opt_shape_info: " << sizes_to_nvdim(opt_values_in_map) + << ", max_shape_info: " << sizes_to_nvdim(max_values_in_map); + + } else { + if (!current_dynamic) { + trt_in = _network->addInput(name.c_str(), nv_type, nv_dims); + LOG(INFO) << "init static tensor input ok : " << nv_dims; + } else { + if (PorosGlobalContext::instance()._value_dynamic_shape_map.count(node_in) <= 0) { + LOG(WARNING) << "can't generate max min opt input setting for value: %" << node_in->debugName(); + return -1; + } + nvinfer1::Dims dynamic_nv_dims = gen_dynamic_dims(node_in); + trt_in = _network->addInput(name.c_str(), nv_type, dynamic_nv_dims); + std::vector min_shapes = PorosGlobalContext::instance()._value_dynamic_shape_map[node_in].min_shapes; + bool ret_min = profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMIN, sizes_to_nvdim(min_shapes)); + std::vector opt_shapes = PorosGlobalContext::instance()._value_dynamic_shape_map[node_in].opt_shapes; + bool ret_opt = profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kOPT, sizes_to_nvdim(opt_shapes)); + std::vector max_shapes = PorosGlobalContext::instance()._value_dynamic_shape_map[node_in].max_shapes; + bool ret_max = profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMAX, sizes_to_nvdim(max_shapes)); + if (ret_min == false || ret_opt == false || ret_max == false) { + LOG(WARNING) << "setDimensions for value: %" << node_in->debugName() << " failed" + << ", min_shape_info: " << sizes_to_nvdim(min_shapes) + << ", opt_shape_info: " << sizes_to_nvdim(opt_shapes) + << ", max_shape_info: " << sizes_to_nvdim(max_shapes) + << ", dynamic tensor info: " << dynamic_nv_dims; + return -1; + } + LOG(INFO) << "Init dynamic tensor input ok: " << nv_dims + << ", min_shape_info: " << sizes_to_nvdim(min_shapes) + << ", opt_shape_info: " << sizes_to_nvdim(opt_shapes) + << ", max_shape_info: " << sizes_to_nvdim(max_shapes); + } + } + _context.set_tensor(in, trt_in); + } + + if (total_is_dynamic) { + POROS_CHECK(profile->isValid(), "Optimization profile is invalid, please check the input range provided"); + _cfg->addOptimizationProfile(profile); + } + return 0; +} + +int TensorrtEngine::mark_graph_outputs(at::ArrayRef outputs) { + int index = 0; + for (const torch::jit::Value* out : outputs) { + auto out_tensor = _context.get_tensor(out); + if (out_tensor == nullptr) { + LOG(WARNING) << "can't get output tensor from context. something is wrong"; + return -1; + } + //output should always be a tensor according to the segmentation setting. 
+        std::string name = std::string("output_") + std::to_string(index++);
+        out_tensor->setName(name.c_str());
+        _network->markOutput(*out_tensor);
+        LOG(INFO) << "mark " << out->debugName() << " named " << name << " as graph output";
+    }
+    return 0;
+}
+
+//DEPRECATED
+std::string TensorrtEngine::convert_graph_to_engine(std::shared_ptr<torch::jit::Graph>& graph) {
+    const torch::jit::Block* block = graph->block();
+    ConvertersMap* converter_map = PorosGlobalContext::instance().get_converter_map(who_am_i());
+    if (converter_map == nullptr) {
+        LOG(ERROR) << "could not find given engine [ " << who_am_i() << " ] in global context";
+        return "";
+    }
+
+    for (const torch::jit::Node* node : block->nodes()) {
+        IConverter* conv = converter_map->get_converter(node);
+        LOG(INFO) << "start to converter for: " << node_info(node);
+        if (!conv->converter(this, node)) {
+            LOG(ERROR) << "converter for node failed [ " << *node->maybeSchema() << " ]";
+            return "";
+        }
+    }
+
+    at::ArrayRef<const torch::jit::Value*> outputs = block->outputs();
+    if (mark_graph_outputs(outputs) < 0) {
+        LOG(ERROR) << " mark graph outputs failed";
+        return "";
+    }
+
+    nvinfer1::ICudaEngine* engine = _builder->buildEngineWithConfig(*_network, *_cfg);
+    if (!engine) {
+        LOG(FATAL) << "build tensorrt engine failed";
+    }
+
+    nvinfer1::IHostMemory* serialized_engine = engine->serialize();
+    engine->destroy();
+    std::string engine_str = std::string((const char*)serialized_engine->data(), serialized_engine->size());
+    serialized_engine->destroy();
+    return engine_str;
+}
+
+void TensorrtEngine::register_module_attribute(const std::string& name, torch::jit::Module& module) {
+    //auto engine_ptr = c10::make_intrusive<TensorrtEngine>(*static_cast<TensorrtEngine*>(this));
+    auto engine_ptr = c10::make_intrusive<TensorrtEngine>(*this);
+
+    module.register_attribute(
+        name,
+        c10::getCustomClassType<c10::intrusive_ptr<TensorrtEngine>>(),
+        c10::IValue(std::move(engine_ptr)),
+        false);
+}
+
+std::vector<at::Tensor> TensorrtEngine::excute_engine(const std::vector<at::Tensor>& inputs) {
+    std::vector<void*> gpu_handles;
+
+    std::vector<at::Tensor> contig_inputs{};
+    contig_inputs.reserve(inputs.size());
+
+    for (size_t i = 0; i < inputs.size(); i++) {
+        uint64_t pyt_idx = _in_binding_map[i];
+        //auto expected_type = nvtype_to_attype(_exec_ctx->getEngine().getBindingDataType(i));
+        //POROS_CHECK(inputs[pyt_idx].dtype() == expected_type,
+        //        "Expected input tensors to have type " << expected_type << ", found type " << inputs[pyt_idx].dtype());
+
+        nvinfer1::Dims dims = sizes_to_nvdim_with_pad(inputs[pyt_idx].sizes(), 1);
+        std::vector<int64_t> shape = nvdim_to_sizes(dims);
+        // at::ScalarType::Long -> at::ScalarType::Int
+        if (inputs[pyt_idx].scalar_type() == c10::ScalarType::Long) {
+            LOG(WARNING) << "excute_engine input meets c10::ScalarType::Long tensor type, change this to c10::ScalarType::Int. "
+                    << "Attention: this may lead to precision change";
" + << "Attention: this may leed to percision change"; + contig_inputs.push_back(inputs[pyt_idx].to(at::ScalarType::Int).view(shape).contiguous()); + } else { + contig_inputs.push_back(inputs[pyt_idx].view(shape).contiguous()); + } + + // 输入可能不在cuda上面,要tocuda + if (contig_inputs[i].device() != c10::DeviceType::CUDA) { + contig_inputs[i] = contig_inputs[i].to(c10::kCUDA).contiguous(); + } + // set input shape binding for nvidia shape tensor + if (_in_shape_tensor_index.count(i) > 0) { + size_t data_nb = inputs[pyt_idx].sizes()[0]; + if (data_nb == 0) { + int32_t set_shape_int = c10::IValue(inputs[pyt_idx].item()).toInt(); + if (!_exec_ctx->setInputShapeBinding(i, &set_shape_int)) { + throw std::runtime_error("tensorrt setInputShapeBinding error"); + } + } else { + std::unique_ptr set_shape_ints(new int32_t[data_nb]); + for (size_t s = 0; s < data_nb; s++) { + c10::IValue tmp_ivalue(inputs[pyt_idx][s].item()); + if (tmp_ivalue.isInt()) { + set_shape_ints[s] = tmp_ivalue.toInt(); + } + } + if (!_exec_ctx->setInputShapeBinding(i, set_shape_ints.get())) { + throw std::runtime_error("tensorrt setInputShapeBinding error"); + } + } + + } else { + if (_exec_ctx->setBindingDimensions(i, dims) == false) { + throw std::runtime_error("tensorrt setBindingDimensions error"); + } + } + gpu_handles.push_back(contig_inputs.back().data_ptr()); + } + + std::vector outputs(_num_io.second); + for (size_t o = inputs.size(); o < (_num_io.first + _num_io.second); o++) { + uint64_t pyt_idx = _out_binding_map[o]; + nvinfer1::Dims out_shape = _exec_ctx->getBindingDimensions(o); + std::vector dims = nvdim_to_sizes(out_shape); + at::ScalarType type = nvtype_to_attype(_exec_ctx->getEngine().getBindingDataType(o)); + outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous()); + gpu_handles.push_back(outputs[pyt_idx].data_ptr()); + } + + c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); + { + std::lock_guard lock(*_mutex); + _exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr); + } + return outputs; + +} + +std::vector execute_engine(const std::vector& inputs, + c10::intrusive_ptr compiled_engine) { + return compiled_engine->excute_engine(inputs); +} + +TORCH_LIBRARY(TensorrtEngine, m) { + auto engine_class = m.class_("TensorrtEngine") + .def(torch::init<>()) + .def_pickle( + [](const c10::intrusive_ptr& self) -> std::string { + auto serialized_engine = self->cuda_engine()->serialize(); + return std::string((const char*)serialized_engine->data(), serialized_engine->size()); + }, + [](std::string seralized_engine) -> c10::intrusive_ptr { + return c10::make_intrusive(std::move(seralized_engine)); + }); + m.def("execute_engine", execute_engine); +} + +POROS_REGISTER_ENGINE(TensorrtEngine); + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/engine/tensorrt_engine.h b/poros/src/poros/engine/tensorrt_engine.h new file mode 100644 index 0000000000..fbbc031e22 --- /dev/null +++ b/poros/src/poros/engine/tensorrt_engine.h @@ -0,0 +1,193 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/poros/src/poros/engine/tensorrt_engine.h b/poros/src/poros/engine/tensorrt_engine.h
new file mode 100644
index 0000000000..fbbc031e22
--- /dev/null
+++ b/poros/src/poros/engine/tensorrt_engine.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file tensorrt_engine.h
+* @author huangben@baidu.com
+* @date Mon Mar 8 11:36:11 CST 2021
+* @brief
+**/
+
+#pragma once
+
+//from cuda
+#include <cuda_runtime.h>
+
+//from pytorch
+#include <torch/script.h>
+#include <torch/custom_class.h>
+
+//from tensorrt
+#include <NvInfer.h>
+#include <NvInferPlugin.h>
+
+#include "poros/compile/poros_module.h"
+#include "poros/engine/engine_context.h"
+#include "poros/engine/iengine.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/log/tensorrt_logging.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * the implement of tensorRT engine
+ **/
+class TensorrtEngine : public IEngine {
+public:
+    TensorrtEngine();
+    TensorrtEngine(std::string engine_str);
+    //virtual ~TensorrtEngine();
+
+    /**
+     * @brief init
+     * @return int
+     * @retval 0 => success, <0 => fail
+     **/
+    virtual int init() override;
+
+    /**
+     * @brief Core implementation. At compile time, converts the subgraph into the engine's own
+     *        graph representation and stores it inside the engine, so that excute_engine can run
+     *        it at runtime. All ops are guaranteed to be supported here.
+     *        Note: input/output tensors must be registered into the engine_context.
+     * @param [in] sub_graph : the subgraph to convert
+     * @return [res]int
+     * @retval 0 => success, <0 => fail
+     **/
+    virtual int transform(const PorosGraph& sub_graph) override;
+
+    /**
+     * @brief runtime execution logic of the subgraph
+     * @param [in] inputs : input tensors
+     * @return [res] output tensors
+     **/
+    virtual std::vector<at::Tensor> excute_engine(const std::vector<at::Tensor>& inputs) override;
+
+    /**
+     * @brief mark the engine in the jit module
+     * @param [in] name : engine signature
+     * @param [in] module : jit module
+     * @param [out] module : the module with the engine signature attached
+     **/
+    virtual void register_module_attribute(const std::string& name, torch::jit::Module& module) override;
+
+    /**
+     * @brief get engine mark
+     * @retval engine name
+     **/
+    virtual const std::string who_am_i() override {
+        return "TensorrtEngine";
+    }
+
+    /**
+     * @brief get context
+     * @retval context
+     **/
+    EngineContext<nvinfer1::ITensor>& context() {
+        return _context;
+    }
+
+    /**
+     * @brief get network
+     * @retval network
+     **/
+    nvinfer1::INetworkDefinition* network() {
+        return _network.get();
+    }
+
+    /**
+     * @brief get cuda engine
+     * @retval engine
+     **/
+    nvinfer1::ICudaEngine* cuda_engine() {
+        return _cuda_engine.get();
+    }
+
+private:
+
+    /**
+     * @brief convert input type from torch to tensorrt
+     * @param [in] input : input value
+     * @param [in] input_type : input value type
+     **/
+    //DEPRECATED
+    void gen_tensorrt_input_type(const torch::jit::Value* input,
+                                nvinfer1::DataType& input_type);
+
+    /**
+     * @brief extract input value from subgraph_node
+     * @param [in] sub_graph : poros graph
+     * @retval 0 => success, <0 => fail
+     **/
+    int init_engine_inputs(const PorosGraph& sub_graph);
+
+    /**
+     * @brief mark a tensor as a network output.
+     * @param [in] outputs : outputs value list
+     * @retval 0 => success, <0 => fail
+     **/
+    int mark_graph_outputs(at::ArrayRef<const torch::jit::Value*> outputs);
+
+    /**
+     * @brief binding input and output for engine
+     **/
+    void binding_io();
+
+    /**
+     * @brief convert jit graph to engine
+     * @param [in] graph : jit graph
+     * @retval serialized tensorrt engine data
+     **/
+    //DEPRECATED
+    std::string convert_graph_to_engine(std::shared_ptr<torch::jit::Graph>& graph);
+
+    /**
+     * @brief gen dynamic dims for given value
+     * @param [in] value : jit value
+     * @retval value dims
+     **/
+    nvinfer1::Dims gen_dynamic_dims(torch::jit::Value* value);
+
+private:
+    //for tensorrt network building
+    baidu::mirana::poros::TensorrtLogger _logger;
+    std::shared_ptr<nvinfer1::IBuilder> _builder;
+    std::shared_ptr<nvinfer1::INetworkDefinition> _network;
+    std::shared_ptr<nvinfer1::IBuilderConfig> _cfg;
+
+    //engine context. stores the value <-> itensor relationship
+    baidu::mirana::poros::EngineContext<nvinfer1::ITensor> _context;
+
+    //for runtime
+    std::shared_ptr<nvinfer1::IRuntime> _runtime;
+    std::shared_ptr<nvinfer1::ICudaEngine> _cuda_engine;
+    std::shared_ptr<nvinfer1::IExecutionContext> _exec_ctx;
+
+    std::unordered_map<uint64_t, uint64_t> _in_binding_map;
+    std::unordered_map<uint64_t, uint64_t> _out_binding_map;
+
+    std::unordered_set<size_t> _in_shape_tensor_index;
+
+    PorosOptions _poros_options;
+    std::shared_ptr<std::mutex> _mutex; //for enqueue
+};
+
+//std::vector<at::Tensor> execute_engine(const std::vector<at::Tensor> inputs,
+//                c10::intrusive_ptr<TensorrtEngine> compiled_engine);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/engine/trtengine_util.cpp b/poros/src/poros/engine/trtengine_util.cpp
new file mode 100644
index 0000000000..01fbe00655
--- /dev/null
+++ b/poros/src/poros/engine/trtengine_util.cpp
@@ -0,0 +1,364 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file trtengine_util.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Jul 21 11:45:49 CST 2021
+* @brief
+**/
+#include "poros/context/poros_global.h"
+#include "poros/engine/trtengine_util.h"
+#include "poros/util/poros_util.h"
+#include "poros/util/macros.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+const std::unordered_map<at::ScalarType, nvinfer1::DataType>& get_at_trt_type_map() {
+    static const std::unordered_map<at::ScalarType, nvinfer1::DataType> at_trt_type_map = {
+        {at::kFloat, nvinfer1::DataType::kFLOAT},
+        {at::kHalf, nvinfer1::DataType::kHALF},
+        {at::kInt, nvinfer1::DataType::kINT32},
+        {at::kChar, nvinfer1::DataType::kINT8},
+        {at::kBool, nvinfer1::DataType::kBOOL},
+        {at::kByte, nvinfer1::DataType::kINT8},
+    };
+    return at_trt_type_map;
+}
+
+const std::unordered_map<nvinfer1::DataType, at::ScalarType>& get_trt_at_type_map() {
+    static const std::unordered_map<nvinfer1::DataType, at::ScalarType> trt_at_type_map = {
+        {nvinfer1::DataType::kFLOAT, at::kFloat},
+        {nvinfer1::DataType::kHALF, at::kHalf},
+        {nvinfer1::DataType::kINT32, at::kInt},
+        {nvinfer1::DataType::kINT8, at::kByte}, //TODO: should this map to kChar or kByte?
+        {nvinfer1::DataType::kBOOL, at::kBool},
+    };
+    return trt_at_type_map;
+}
+} // namespace
+
+bool broadcastable(nvinfer1::Dims a, nvinfer1::Dims b, bool multidirectional) {
+    if (a == b) {
+        return true;
+    }
+
+    if (multidirectional) {
+        nvinfer1::Dims a_dims_eq;
+        nvinfer1::Dims b_dims_eq;
+        if (a.nbDims > b.nbDims) {
+            a_dims_eq = a;
+            b_dims_eq = sizes_to_nvdim_with_pad(nvdim_to_sizes(b), a.nbDims);
+        } else if (a.nbDims < b.nbDims) {
+            a_dims_eq = sizes_to_nvdim_with_pad(nvdim_to_sizes(a), b.nbDims);
+            b_dims_eq = b;
+        } else {
+            a_dims_eq = a;
+            b_dims_eq = b;
+        }
+
+        bool broadcastable = true;
+        for (int i = 0; i < a_dims_eq.nbDims; i++) {
+            if (b_dims_eq.d[i] == a_dims_eq.d[i] || (b_dims_eq.d[i] == 1 || a_dims_eq.d[i] == 1)) {
+                continue;
+            } else {
+                broadcastable = false;
+                break;
+            }
+        }
+        return broadcastable;
+    } else {
+        nvinfer1::Dims b_dims_eq;
+        if (a.nbDims > b.nbDims) {
+            b_dims_eq = sizes_to_nvdim_with_pad(nvdim_to_sizes(b), a.nbDims);
+        } else if (a.nbDims < b.nbDims) {
+            return false;
+        } else {
+            b_dims_eq = b;
+        }
+
+        bool broadcastable = true;
+        for (int i = 0; i < a.nbDims; i++) {
+            if (b_dims_eq.d[i] == a.d[i] || b_dims_eq.d[i] == 1) {
+                continue;
+            } else {
+                broadcastable = false;
+                break;
+            }
+        }
+        return broadcastable;
+    }
+}
+
+nvinfer1::Dims sizes_to_nvdim(const std::vector<int64_t>& sizes) {
+    if (sizes.size() > nvinfer1::Dims::MAX_DIMS) {
+        LOG(FATAL) << "given sizes exceed the max dims of tensorrt";
+        throw std::runtime_error("given sizes exceed the max dims of tensorrt");
+    }
+    nvinfer1::Dims dims;
+    dims.nbDims = sizes.size();
+    for (size_t i = 0; i < sizes.size(); i++) {
+        dims.d[i] = sizes[i];
+    }
+    return dims;
+}
+
+nvinfer1::Dims sizes_to_nvdim(c10::IntArrayRef sizes) {
+    if (sizes.size() > nvinfer1::Dims::MAX_DIMS) {
+        LOG(FATAL) << "given sizes exceed the max dims of tensorrt";
+        throw std::runtime_error("given sizes exceed the max dims of tensorrt");
+    }
+    nvinfer1::Dims dims;
+    dims.nbDims = sizes.size();
+    for (size_t i = 0; i < sizes.size(); i++) {
+        dims.d[i] = sizes[i];
+    }
+    return dims;
+}
+
+nvinfer1::Dims sizes_to_nvdim(c10::List<int64_t> sizes) {
+    if (sizes.size() > nvinfer1::Dims::MAX_DIMS) {
+        LOG(FATAL) << "given sizes exceed the max dims of tensorrt";
+        throw std::runtime_error("given sizes exceed the max dims of tensorrt");
+    }
+    nvinfer1::Dims dims;
+    dims.nbDims = sizes.size();
+    for (size_t i = 0; i < sizes.size(); i++) {
+        dims.d[i] = sizes[i];
+    }
+    return dims;
+}
+
+nvinfer1::Dims sizes_to_nvdim_with_pad(c10::IntArrayRef sizes, uint64_t pad_to) {
+    if (pad_to > nvinfer1::Dims::MAX_DIMS || sizes.size() > nvinfer1::Dims::MAX_DIMS) {
+        LOG(FATAL) << "given sizes exceed the max dims of tensorrt";
+        throw std::runtime_error("given sizes exceed the max dims of tensorrt");
+    }
+
+    nvinfer1::Dims dims;
+    //no need padding situation
+    if (sizes.size() > pad_to) {
+        dims.nbDims = sizes.size();
+        for (size_t i = 0; i < sizes.size(); i++) {
+            dims.d[i] = sizes[i];
+        }
+    //need padding situation
+    } else {
+        dims.nbDims = pad_to;
+        for (size_t i = 0; i < pad_to - sizes.size(); i++) {
+            dims.d[i] = 1;
+        }
+        for (size_t i = pad_to - sizes.size(); i < pad_to; i++) {
+            dims.d[i] = sizes[i - (pad_to - sizes.size())];
+        }
+    }
+    return dims;
+}
+
+nvinfer1::Dims sizes_to_nvdim_with_pad(c10::List<int64_t> sizes, uint64_t pad_to) {
+    if (pad_to > nvinfer1::Dims::MAX_DIMS || sizes.size() > nvinfer1::Dims::MAX_DIMS) {
+        LOG(FATAL) << "given sizes exceed the max dims of tensorrt";
+        throw std::runtime_error("given sizes exceed the max dims of tensorrt");
+    }
tensorrt"); + } + + nvinfer1::Dims dims; + //no need padding situation + if (sizes.size() > pad_to) { + LOG(INFO) << "no need to pad, give sizes: " << sizes.size() + << ", expected dims: " << pad_to; + dims.nbDims = sizes.size(); + for (size_t i = 0; i < sizes.size(); i++) { + dims.d[i] = sizes[i]; + } + //need padding situation + } else { + dims.nbDims = pad_to; + for (size_t i = 0; i < pad_to - sizes.size(); i++) { + dims.d[i] = 1; + } + for (size_t i = pad_to - sizes.size(); i < pad_to; i++) { + dims.d[i] = sizes[i - (pad_to - sizes.size())]; + } + } + return dims; +} + +std::vector nvdim_to_sizes(const nvinfer1::Dims& dim) { + std::vector sizes; + for (int i = 0; i < dim.nbDims; i++) { + sizes.push_back(dim.d[i]); + } + return std::move(sizes); +} + +std::string nvdim_to_str(const nvinfer1::Dims& dim) { + std::stringstream ss; + ss << dim; + return ss.str(); +} + +int64_t nvdim_to_volume(const nvinfer1::Dims& dim) { + return std::accumulate(dim.d, dim.d + dim.nbDims, 1, std::multiplies()); +} + +nvinfer1::Dims unpad_nvdim(const nvinfer1::Dims& dim) { + nvinfer1::Dims new_dim; + int j = 0; + bool pad_dims_done = false; + + for (int i = 0; i < dim.nbDims; i++) { + if (dim.d[i] == 1 && !pad_dims_done) { + // skip over unecessary dimension + continue; + } else { + new_dim.d[j] = dim.d[i]; + j++; + // keep all other dimensions (don't skip over them) + pad_dims_done = true; + } + } + new_dim.nbDims = j; + return new_dim; +} + +bool gen_tensor_type(const torch::jit::Node& node, const size_t index, nvinfer1::DataType& nv_type) { + c10::optional maybe_type; + //at::ArrayRef inputs = node.inputs(); + std::shared_ptr subgraph = node.g(torch::jit::attr::Subgraph); + at::ArrayRef inputs = subgraph->inputs(); + //for (size_t index = 0; index < inputs.size(); index++) { + auto value = inputs[index]; + + //extract scalar type from tensor. + if (value->type()->isSubtypeOf(c10::TensorType::get())) { + c10::TensorTypePtr op = value->type()->cast(); + if (op->scalarType().has_value()) { + maybe_type = op->scalarType().value(); + } + + //extract scalar type from tensorlist. + } else if (value->type()->isSubtypeOf(c10::ListType::ofTensors())) { + auto list_element = value->type()->cast()->getElementType(); + //TODO: ADD SOPPORT HERE + LOG(WARNING) << "gen_tensor_type for tensorlist to add more"; + return false; + } + + //this is added because tensorrt only support five kinds of date type + //(kFloat / kHalf / kINT8 / kINT32 / kBOOL) for now. (2021.08.01) + if (maybe_type.has_value() && maybe_type.value() != at::kFloat && + maybe_type.value() != at::kHalf && maybe_type.value() != at::kChar && + maybe_type.value() != at::kInt && maybe_type.value() != at::kBool) { + // when we meet at::KLong and globalContext allow us to down to at::KInt + if (maybe_type.value() == at::kLong && PorosGlobalContext::instance().get_poros_options().long_to_int == true) { + nv_type = attype_to_nvtype(at::kInt); + LOG(WARNING) << "gen_tensor_type meets at::KLong tensor type, change this to at::KInt. 
" + << "Attention: this may leed to percision change"; + return true; + } + LOG(WARNING) << "gen_tensor_type failed, reason: " + << "given scalartype is not supported by tensorrt"; + return false; + } + + if (maybe_type.has_value()) { + nv_type = attype_to_nvtype(maybe_type.value()); + return true; + } else { + LOG(WARNING) << "gen_tensor_type failed, reason: " + << "cant't extract scalar type from all the input value"; + return false; + } +} + +at::ScalarType nvtype_to_attype(nvinfer1::DataType type) { + auto trt_at_type_map = get_trt_at_type_map(); + if (trt_at_type_map.find(type) == trt_at_type_map.end()) { + LOG(FATAL) << "unsupported tensorrt datatype"; + throw std::runtime_error("unsupported tensorrt datatype"); + } + return trt_at_type_map.at(type); +} + +nvinfer1::DataType attype_to_nvtype(at::ScalarType type) { + auto at_trt_type_map = get_at_trt_type_map(); + if (at_trt_type_map.find(type) == at_trt_type_map.end()) { + LOG(FATAL) << "unsupported aten datatype"; + throw std::runtime_error("unsupported aten datatype"); + } + return at_trt_type_map.at(type); +} + +nvinfer1::Dims unsqueeze_dims(const nvinfer1::Dims& d, int pos, int val, bool use_zeros) { + // acceptable range for pos is [0, d.nbDims] + POROS_CHECK(pos >= 0 && pos <= d.nbDims, "ERROR: Index to unsqueeze is out of bounds."); + nvinfer1::Dims dims; + for (int i = 0, j = 0; j <= d.nbDims; j++) { + // add new dimension at pos + if (j == pos) { + dims.d[j] = val; + } else { + dims.d[j] = (use_zeros && d.d[i] == -1) ? 0 : d.d[i]; + ++i; + } + } + dims.nbDims = d.nbDims + 1; + return dims; +} + +nvinfer1::Dims squeeze_dims(const nvinfer1::Dims& d, int pos, bool use_zeros) { + // acceptable range for pos is [0, d.nbDims] + POROS_CHECK(pos >= 0 && pos <= d.nbDims, "ERROR: Index to unsqueeze is out of bounds."); + nvinfer1::Dims dims; + int j = 0; + for (int i = 0; i < d.nbDims; i++) { + if (i != pos) { + dims.d[j++] = (use_zeros && d.d[i] == -1) ? 0 : d.d[i]; + } + } + dims.nbDims = j; + return dims; +} + +bool check_nvtensor_is_dynamic(const nvinfer1::ITensor* nvtensor) { + POROS_CHECK(nvtensor != nullptr, "input nvtensor is null"); + nvinfer1::Dims nvtensor_dims = nvtensor->getDimensions(); + for (int i = 0; i < nvtensor_dims.nbDims; i++) { + if (nvtensor_dims.d[i] < 0) { + return true; + } + } + return false; +} + +bool input_is_dynamic(torch::jit::Value* input) { + auto _value_dynamic_shape_map = PorosGlobalContext::instance()._value_dynamic_shape_map; + if (_value_dynamic_shape_map.find(input) != _value_dynamic_shape_map.end()) { + auto min_shapes = _value_dynamic_shape_map[input].min_shapes; + auto max_shapes = _value_dynamic_shape_map[input].max_shapes; + for(size_t i = 0; i < min_shapes.size(); i++) { + if (max_shapes[i] != min_shapes[i]) { + return true; + } + } + } + return false; +} + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/engine/trtengine_util.h b/poros/src/poros/engine/trtengine_util.h new file mode 100644 index 0000000000..93aa629708 --- /dev/null +++ b/poros/src/poros/engine/trtengine_util.h @@ -0,0 +1,130 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/poros/src/poros/engine/trtengine_util.h b/poros/src/poros/engine/trtengine_util.h
new file mode 100644
index 0000000000..93aa629708
--- /dev/null
+++ b/poros/src/poros/engine/trtengine_util.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file trtengine_util.h
+* @author tianjinjin@baidu.com
+* @date Wed Jul 21 11:45:49 CST 2021
+* @brief
+**/
+
+#pragma once
+
+//from pytorch
+#include "torch/script.h"
+//from tensorrt
+#include "NvInfer.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// operator== for nvinfer1::Dims
+inline bool operator==(const nvinfer1::Dims& in1, const nvinfer1::Dims& in2) {
+    if (in1.nbDims != in2.nbDims) {
+        return false;
+    }
+    // TODO maybe look to support broadcasting comparisons
+    for (int64_t i = 0; i < in1.nbDims; i++) {
+        if (in1.d[i] != in2.d[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// operator!= for nvinfer1::Dims
+inline bool operator!=(const nvinfer1::Dims& in1, const nvinfer1::Dims& in2) {
+    return !(in1 == in2);
+}
+
+// operator<< for nvinfer1::Dims
+template <typename T>
+inline std::ostream& print_sequence(std::ostream& stream, const T* begin, int count) {
+    stream << "[";
+    if (count > 0) {
+        std::copy_n(begin, count - 1, std::ostream_iterator<T>(stream, ", "));
+        stream << begin[count - 1];
+    }
+    stream << "]";
+    return stream;
+}
+
+inline std::ostream& operator<<(std::ostream& stream, const nvinfer1::Dims& shape) {
+    return print_sequence(stream, shape.d, shape.nbDims);
+}
+
+// operator<< for nvinfer1::DataType
+inline std::ostream& operator<<(std::ostream& stream, const nvinfer1::DataType& dtype) {
+    switch (dtype) {
+        case nvinfer1::DataType::kFLOAT:
+            return stream << "Float32";
+        case nvinfer1::DataType::kHALF:
+            return stream << "Float16";
+        case nvinfer1::DataType::kINT8:
+            return stream << "Int8";
+        case nvinfer1::DataType::kINT32:
+            return stream << "Int32";
+        case nvinfer1::DataType::kBOOL:
+            return stream << "Bool";
+        default:
+            return stream << "Unknown Data Type";
+    }
+}
+
+// create a shared_ptr that takes ownership of a raw pointer
+template <typename T>
+std::shared_ptr<T> make_shared_ptr(T* p) {
+    return std::shared_ptr<T>(p);
+}
+
+//int64_t volume(const nvinfer1::Dims& dim); //move to nvdim_to_volume
+bool broadcastable(nvinfer1::Dims a, nvinfer1::Dims b, bool multidirectional = true);
+
+// The following functions convert between TensorRT's Dims structure and vector-style sizes.
+nvinfer1::Dims sizes_to_nvdim(const std::vector<int64_t>& sizes);
+nvinfer1::Dims sizes_to_nvdim(c10::IntArrayRef sizes);
+nvinfer1::Dims sizes_to_nvdim(c10::List<int64_t> sizes);
+nvinfer1::Dims sizes_to_nvdim_with_pad(c10::IntArrayRef sizes, uint64_t pad_to);
+nvinfer1::Dims sizes_to_nvdim_with_pad(c10::List<int64_t> sizes, uint64_t pad_to);
+// The following three functions convert TensorRT dims to other forms.
+std::vector<int64_t> nvdim_to_sizes(const nvinfer1::Dims& dim);
+std::string nvdim_to_str(const nvinfer1::Dims& dim);
+int64_t nvdim_to_volume(const nvinfer1::Dims& dim);
+
+// The following function unpads dims (strips leading 1s).
+nvinfer1::Dims unpad_nvdim(const nvinfer1::Dims& dim);
+
+// The following two functions convert between TensorRT and ATen types.
+//transform tensorrt-type to aten-type(which used in pytorch and torchscript)
+at::ScalarType nvtype_to_attype(nvinfer1::DataType type);
+//transform aten-type(which used in pytorch and torchscript) to tensorrt-type
+nvinfer1::DataType attype_to_nvtype(at::ScalarType type);
+
+// The following two functions unsqueeze and squeeze TensorRT dims.
+nvinfer1::Dims unsqueeze_dims(const nvinfer1::Dims& d, int pos, int val = 1, bool use_zeros = true);
+nvinfer1::Dims squeeze_dims(const nvinfer1::Dims& d, int pos, bool use_zeros = true);
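+
+// Usage sketch (illustrative values): pos is the slot at which a dim of size val is
+// inserted, and use_zeros maps dynamic (-1) dims to 0 (commonly used with TensorRT
+// reshape, where 0 means "copy the input dimension"), e.g.
+//
+//     unsqueeze_dims([2, 3], /*pos=*/0)      -> [1, 2, 3]
+//     unsqueeze_dims([2, -1], /*pos=*/2)     -> [2, 0, 1]   (with use_zeros = true)
+//     squeeze_dims([1, 2, 3], /*pos=*/0)     -> [2, 3]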
+
+/*
+* @brief Get the corresponding tensor type from the input info of a node.
+**/
+bool gen_tensor_type(const torch::jit::Node& node, const size_t index, nvinfer1::DataType& nv_type);
+// check whether the input nvtensor is a dynamic input
+bool check_nvtensor_is_dynamic(const nvinfer1::ITensor* nvtensor);
+// check whether a subgraph input is dynamic
+bool input_is_dynamic(torch::jit::Value* input);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/iplugin/plugin_create.cpp b/poros/src/poros/iplugin/plugin_create.cpp
new file mode 100644
index 0000000000..21e0d9e2e4
--- /dev/null
+++ b/poros/src/poros/iplugin/plugin_create.cpp
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file plugin_create.cpp
+ * @author huangben(huangben@baidu.com)
+ * @date 2018/10/23 14:16:18
+ * @version $Revision$
+ * @brief
+ **/
+#include "poros/iplugin/plugin_create.h"
+#include "poros/log/poros_logging.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+static plugin_creator_map_t _g_creator_map;
+
+void register_plugin_creator(const std::string& plugin_name, plugin_creator_t creator) {
+    if (_g_creator_map.find(plugin_name) != _g_creator_map.end()) {
+        //throw bsl::KeyAlreadyExistException() << BSL_EARG
+        //        << "[plugin_name:" << plugin_name << "]";
+        LOG(ERROR) << plugin_name << " has already been registered! More than one plugin uses the same name";
+    }
+    _g_creator_map[plugin_name] = creator;
+}
+
+void register_plugin_creator(const std::string& plugin_name, plugin_creator_t creator,
+        plugin_creator_map_t& plugin_creator_map) {
+
+    if (plugin_creator_map.find(plugin_name) != plugin_creator_map.end()) {
+        //throw bsl::KeyAlreadyExistException() << BSL_EARG
+        //        << "[plugin_name:" << plugin_name << "]";
+        LOG(ERROR) << plugin_name << " has already been registered! More than one plugin uses the same name";
+    }
+    plugin_creator_map[plugin_name] = creator;
+}
+
+IPlugin* create_plugin(const std::string& plugin_name) {
+    plugin_creator_map_t::const_iterator it;
+
+    it = _g_creator_map.find(plugin_name);
+    if (it == _g_creator_map.end()) {
+        LOG(FATAL) << "No such plugin type:" << plugin_name;
+        return NULL;
+    }
+    return it->second();
+}
+
+IPlugin* create_plugin(const std::string& plugin_name, const plugin_creator_map_t& plugin_creator_map) {
+    plugin_creator_map_t::const_iterator it;
+
+    it = plugin_creator_map.find(plugin_name);
+    if (it == plugin_creator_map.end()) {
+        LOG(FATAL) << "No such plugin type:" << plugin_name;
+        return NULL;
+    }
+    return it->second();
+}
+
+//void create_all_plugins(std::unordered_map<std::string, IPlugin*>& plugin_m) {
+//    for (auto& e : _g_creator_map) {
+//        plugin_m[e.first] = e.second();
+//    }
+//}
+void create_all_plugins(const plugin_creator_map_t& plugin_creator_map,
+        std::unordered_map<std::string, IPlugin*>& plugin_m) {
+    for (auto& e : plugin_creator_map) {
+        plugin_m[e.first] = e.second();
+    }
+}
+
+}//poros
+}//mirana
+}//baidu
+
+/* vim: set ts=4 sw=4 sts=4 tw=100 */
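+
+// Usage sketch (MyPlugin is a placeholder name, not part of this patch); the
+// templates used here are declared in plugin_create.h below:
+//
+//     class MyPlugin : public IPlugin {
+//     public:
+//         const std::string who_am_i() override { return "MyPlugin"; }
+//     };
+//
+//     register_plugin_class<MyPlugin>("MyPlugin");   // put a creator into the global registry
+//     IPlugin* plugin = create_plugin("MyPlugin");   // factory lookup by name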
diff --git a/poros/src/poros/iplugin/plugin_create.h b/poros/src/poros/iplugin/plugin_create.h
new file mode 100644
index 0000000000..586a2a0468
--- /dev/null
+++ b/poros/src/poros/iplugin/plugin_create.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file plugin_create.h
+ * @author huangben(huangben@baidu.com)
+ * @date 2018/10/23 14:16:18
+ * @version $Revision$
+ * @brief
+ **/
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class IPlugin {
+public:
+    virtual ~IPlugin() {}
+    virtual const std::string who_am_i() = 0;
+};
+
+typedef IPlugin* (*plugin_creator_t)();
+typedef std::unordered_map<std::string, plugin_creator_t> plugin_creator_map_t;
+
+IPlugin* create_plugin(const std::string& plugin_name);
+IPlugin* create_plugin(const std::string& plugin_name, const plugin_creator_map_t& plugin_creator_map);
+
+void create_all_plugins(const plugin_creator_map_t& plugin_creator_map,
+        std::unordered_map<std::string, IPlugin*>& plugin_m);
+//void create_all_plugins(std::unordered_map<std::string, IPlugin*>& plugin_m);
+
+template <typename PluginType>
+IPlugin* default_plugin_creator() {
+    return new (std::nothrow) PluginType;
+}
+
+void register_plugin_creator(const std::string& plugin_name, plugin_creator_t creator);
+void register_plugin_creator(const std::string& plugin_name,
+        plugin_creator_t creator, plugin_creator_map_t& plugin_creator_map);
+
+template <typename PluginType>
+void register_plugin_class(const std::string& plugin_name) {
+    return register_plugin_creator(plugin_name, default_plugin_creator<PluginType>);
+}
+
+// Prefer this overload.
+template <typename PluginType>
+void register_plugin_class(const std::string& plugin_name, plugin_creator_map_t& plugin_creator_map) {
+    return register_plugin_creator(plugin_name, default_plugin_creator<PluginType>, plugin_creator_map);
+}
+
+}//poros
+}//mirana
+}//baidu
+
+/* vim: set ts=4 sw=4 sts=4 tw=100 */
diff --git a/poros/src/poros/log/poros_logging.h b/poros/src/poros/log/poros_logging.h
new file mode 100644
index 0000000000..0cbe1d141d
--- /dev/null
+++ b/poros/src/poros/log/poros_logging.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_logging.h
+* @author tianjinjin@baidu.com
+* @date Wed Jun 2 20:54:24 CST 2021
+* @brief
+**/
+
+#pragma once
+
+//from pytorch
+#include "c10/util/Logging.h"
diff --git a/poros/src/poros/log/tensorrt_logging.cpp b/poros/src/poros/log/tensorrt_logging.cpp
new file mode 100644
index 0000000000..65064b1093
--- /dev/null
+++ b/poros/src/poros/log/tensorrt_logging.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file tensorrt_logging.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Jun 2 21:14:23 CST 2021
+* @brief
+**/
+
+#include <iostream>
+#include "poros/context/poros_global.h"
+#include "poros/log/tensorrt_logging.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/*
+TensorrtLogger::TensorrtLogger(logging::LogSeverity severity) {
+    _severity = severity;
+    switch (severity) {
+        case logging::BLOG_INFO:
+            _nv_level = nvinfer1::ILogger::Severity::kINFO;
+            break;
+        case logging::BLOG_NOTICE:
+            _nv_level = nvinfer1::ILogger::Severity::kVERBOSE;
+            break;
+        case logging::BLOG_WARNING:
+            _nv_level = nvinfer1::ILogger::Severity::kWARNING;
+            break;
+        case logging::BLOG_ERROR:
+            _nv_level = nvinfer1::ILogger::Severity::kERROR;
+            break;
+        case logging::BLOG_FATAL:
+            _nv_level = nvinfer1::ILogger::Severity::kINTERNAL_ERROR;
+            break;
+        default:
+            break;
+    }
+} */
+
+TensorrtLogger::TensorrtLogger() {
+    auto debug = PorosGlobalContext::instance().get_poros_options().debug;
+    _torch_level = debug ? 0 : 1;
+    _nv_level = debug ? nvinfer1::ILogger::Severity::kVERBOSE :
+            nvinfer1::ILogger::Severity::kWARNING;
+}
+
+TensorrtLogger::TensorrtLogger(uint32_t torch_level) {
+    _torch_level = torch_level;
+    switch (torch_level) {
+        case 3: /*c10::GLOG_FATAL*/
+            _nv_level = nvinfer1::ILogger::Severity::kINTERNAL_ERROR;
+            break;
+        case 2: /*c10::GLOG_ERROR*/
+            _nv_level = nvinfer1::ILogger::Severity::kERROR;
+            break;
+        case 1: /*c10::GLOG_WARNING*/
+            _nv_level = nvinfer1::ILogger::Severity::kWARNING;
+            break;
+        case 0: /*c10::GLOG_INFO*/
+            _nv_level = nvinfer1::ILogger::Severity::kVERBOSE;
+            break;
+        default: /*c10::GLOG_WARNING*/
+            _nv_level = nvinfer1::ILogger::Severity::kWARNING;
+            break;
+    }
+}
+
+void TensorrtLogger::log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept {
+    // suppress messages that are less severe than the configured level
+    if (severity > _nv_level) {
+        return;
+    }
+    std::cout << msg << std::endl; //TO MAKE THIS BETTER.
+}
+
+TensorrtLogger& get_nvlogger() {
+    static TensorrtLogger nv_logger;
+    return nv_logger;
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
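+
+// Usage sketch (illustrative): the glog-style torch level picked at construction
+// decides which TensorRT messages pass through, e.g.
+//
+//     TensorrtLogger logger(1);  // c10::GLOG_WARNING -> nvinfer1 kWARNING threshold
+//     logger.log(nvinfer1::ILogger::Severity::kERROR, "printed");
+//     logger.log(nvinfer1::ILogger::Severity::kINFO, "suppressed");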
diff --git a/poros/src/poros/log/tensorrt_logging.h b/poros/src/poros/log/tensorrt_logging.h
new file mode 100644
index 0000000000..795f27e3ef
--- /dev/null
+++ b/poros/src/poros/log/tensorrt_logging.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file tensorrt_logging.h
+* @author tianjinjin@baidu.com
+* @date Wed Jun 2 20:54:24 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <cstdint>
+
+//from pytorch
+#include "c10/util/Logging.h"
+
+//from tensorrt
+#include "NvInfer.h"
+
+#include "poros/log/poros_logging.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * the required logger setting for tensorrt engine
+ **/
+class TensorrtLogger : public nvinfer1::ILogger {
+public:
+    TensorrtLogger();
+    TensorrtLogger(uint32_t torch_level);
+    void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept;
+    uint32_t torch_level() {
+        return _torch_level;
+    }
+
+private:
+    uint32_t _torch_level = 1;
+    nvinfer1::ILogger::Severity _nv_level;
+};
+
+TensorrtLogger& get_nvlogger();
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/eliminate_exception_pass.cpp b/poros/src/poros/lowering/eliminate_exception_pass.cpp
new file mode 100644
index 0000000000..133e519483
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_exception_pass.cpp
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file eliminate_exception_pass.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Sep 23 11:15:49 CST 2021
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+struct EliminateExceptionPasses {
+    EliminateExceptionPasses(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        find_exception_if_node(graph_->block());
+        torch::jit::EliminateDeadCode(graph_);
+    }
+
+private:
+    bool is_exception_if_node(Node* n) {
+        /// Check if this Node hosts a pattern like so:
+        /// situation 1:
+        ///  = prim::If(%5958)
+        ///   block0():
+        ///     -> ()
+        ///   block1():
+        ///      = prim::RaiseException(%45)
+        ///     -> ()
+        ///
+        /// situation 2:
+        ///  = prim::If(%5958)
+        ///   block0():
+        ///      = prim::RaiseException(%45)
+        ///     -> ()
+        ///   block1():
+        ///     -> ()
+        if (n->blocks().size() != 2) {
+            return false;
+        }
+        auto arm1 = n->blocks()[0];
+        auto arm2 = n->blocks()[1];
+        if (arm1->outputs().size() != 0 || arm2->outputs().size() != 0) {
+            // Make sure that the node doesn't actually produce any Value that are
+            // used by other nodes
+            return false;
+        }
+
+        auto arm1_start = arm1->nodes().begin();
+        auto arm2_start = arm2->nodes().begin();
+
+        if ((*arm1_start)->kind() == prim::Return) {
+            // Make sure that block0 is solely the return
+            if ((*arm2_start)->kind() != prim::RaiseException || (*(++arm2_start))->kind() != prim::Return) {
+                // Make sure that block1 is solely just the exception and the return
+                return false;
+            }
+            return true;
+        }
+
+        if ((*arm2_start)->kind() == prim::Return) {
+            // Make sure that block1 is solely the return
+            if ((*arm1_start)->kind() != prim::RaiseException || (*(++arm1_start))->kind() != prim::Return) {
+                // Make sure that block0 is solely just the exception and the return
+                return false;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    void find_exception_if_node(Block* b) {
+        for (auto it = b->nodes().begin(); it != b->nodes().end(); it++) {
+            auto n = *it;
+            if (n->kind() == prim::If && is_exception_if_node(n)) {
+                it.destroyCurrent();
+            } else if (n->kind() == prim::If) {
+                auto true_block = n->blocks()[0];
+                find_exception_if_node(true_block);
+                auto false_block = n->blocks()[1];
+                find_exception_if_node(false_block);
+            } else if (n->kind() == prim::Loop) {
+                auto loop_block = n->blocks()[0];
+                find_exception_if_node(loop_block);
+            }
+        }
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+} // namespace
+
+void eliminate_exception_pass(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateExceptionPasses eppe(std::move(graph));
+    eppe.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/eliminate_maxpoll_with_indices.cpp b/poros/src/poros/lowering/eliminate_maxpoll_with_indices.cpp
new file mode 100644
index 0000000000..c18b40208f
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_maxpoll_with_indices.cpp
@@ -0,0 +1,129 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file eliminate_maxpoll_with_indices.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Sep 13 11:06:07 CST 2022
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+/**
+ * @brief Try to replace maxpool_with_indices with plain maxpool.
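+ * An illustrative (hypothetical) before/after in TorchScript IR:
+ *     before: %out, %indices = aten::max_pool2d_with_indices(%x, %k, %s, %p, %d, %c)  (%indices has no uses)
+ *     after:  %out = aten::max_pool2d(%x, %k, %s, %p, %d, %c)
+ *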
+ * Take maxpool2d as an example:
+ * the schema of max_pool2d_with_indices is: aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+ * and the schema of max_pool2d is: aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+ * The two ops take exactly the same inputs. On the output side, max_pool2d_with_indices has two
+ * outputs: the first is identical to the output of max_pool2d, and the second carries the indices.
+ * When the second output (the indices) of max_pool2d_with_indices is not used by any later op,
+ * we replace max_pool2d_with_indices with max_pool2d directly.
+ **/
+struct EliminateMaxpollWithIndices {
+    EliminateMaxpollWithIndices(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        GRAPH_DUMP("before eliminate_maxpool_with_indices Graph: ", graph_);
+        bool changed = eliminate_maxpool_with_indices(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("after eliminate_maxpool_with_indices Graph: ", graph_);
+        return;
+    }
+
+private:
+
+    bool is_maxpoll_with_indices_pattern(Node* node) {
+        if (node->kind() != aten::max_pool1d_with_indices &&
+            node->kind() != aten::max_pool2d_with_indices &&
+            node->kind() != aten::max_pool3d_with_indices) {
+            return false;
+        }
+
+        // The rewrite applies when the second output, i.e. the indices, has no other users.
+        Value* indices = node->output(1);
+        if (indices->uses().size() == 0) {
+            return true;
+        }
+        return false;
+    }
+
+    bool replace_maxpool_with_indices(Node* node) {
+        NodeKind replace_kind = aten::max_pool1d;
+        if (node->kind() == aten::max_pool2d_with_indices) {
+            replace_kind = aten::max_pool2d;
+        } else if (node->kind() == aten::max_pool3d_with_indices) {
+            replace_kind = aten::max_pool3d;
+        }
+
+        Node* maxpool_node = graph_->create(replace_kind, node->inputs());
+        maxpool_node->output(0)->setType(node->output(0)->type());
+        maxpool_node->copyMetadata(node);
+        maxpool_node->insertBefore(node);
+        node->output(0)->replaceAllUsesAfterNodeWith(node, maxpool_node->output(0));
+        //node->output(0)->replaceAllUsesWith(maxpool_node->output(0));
+
+        LOG(INFO) << "destroy maxpool_with_indices node now: " << node_info(node);
+        node->destroy();
+        return true;
+    }
+
+    bool eliminate_maxpool_with_indices(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+            // we might destroy the current node, so we need to pre-increment
+            // the iterator
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= eliminate_maxpool_with_indices(subblock);
+            }
+            if (is_maxpoll_with_indices_pattern(node)) {
+                LOG(INFO) << "find maxpool with indices pattern: " << node_info(node);
+                changed |= replace_maxpool_with_indices(node);
+            }
+        }
+        return changed;
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+
+} // namespace
+
+void eliminate_maxpool_with_indices(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateMaxpollWithIndices emwi(std::move(graph));
+    emwi.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/eliminate_simple_useless_nodes.cpp b/poros/src/poros/lowering/eliminate_simple_useless_nodes.cpp
new file mode 100644
index 0000000000..139bcbeb5c
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_simple_useless_nodes.cpp
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file eliminate_simple_useless_nodes.cpp
+* @author tianshaoqing@baidu.com
+* @date 2022-08-25 11:06:26
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/frontend/function_schema_parser.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct EliminateSimpleUselessNodes {
+    EliminateSimpleUselessNodes(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {
+        useless_schema_set_.emplace(torch::jit::parseSchema("aten::dropout(Tensor input, float p, "
+                "bool train) -> Tensor").operator_name());
+        useless_schema_set_.emplace(torch::jit::parseSchema("aten::warn(str message, int stacklevel=2) "
+                "-> ()").operator_name());
+    }
+
+    void run() {
+        GRAPH_DUMP("before eliminate_simple_useless_nodes Graph: ", graph_);
+        bool changed = find_and_eliminate_simple_useless_nodes(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("after eliminate_simple_useless_nodes Graph: ", graph_);
+        return;
+    }
+
+private:
+    bool find_and_eliminate_simple_useless_nodes(Block* block) {
+        bool graph_changed = false;
+        auto it = block->nodes().begin();
+        while (it != block->nodes().end()) {
+            auto node = *it;
+            ++it; //++it first, node may be destroyed later.
+            for (auto sub_block: node->blocks()) {
+                if (find_and_eliminate_simple_useless_nodes(sub_block)) {
+                    graph_changed = true;
+                }
+            }
+
+            if (node->maybeSchema() && useless_schema_set_.count(node->schema().operator_name())) {
+                if (node->kind() == torch::jit::aten::warn) {
+                    node->destroy();
+                } else if (node->kind() == torch::jit::aten::dropout) {
+                    // else-if: the node must not be touched again once destroyed
+                    node->output(0)->replaceAllUsesWith(node->input(0));
+                    node->destroy();
+                }
+                graph_changed = true;
+            }
+        }
+        return graph_changed;
+    }
+
+    std::shared_ptr<Graph> graph_;
+    std::unordered_set<c10::OperatorName> useless_schema_set_;
+};
+
+} // namespace
+
+void eliminate_simple_useless_nodes(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateSimpleUselessNodes esun(std::move(graph));
+    esun.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/eliminate_some_dict.cpp b/poros/src/poros/lowering/eliminate_some_dict.cpp
new file mode 100644
index 0000000000..e43daca5fe
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_some_dict.cpp
@@ -0,0 +1,166 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
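+
+// Illustrative (hypothetical) TorchScript source that produces the pattern this
+// pass removes:
+//     d = {"first_key": v1, "second_key": v2}
+//     r1 = torch.matmul(d["first_key"], w1)
+// After the pass, the aten::__getitem__ reads resolve directly to %v1 and the
+// prim::DictConstruct becomes dead code.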
+
+/**
+* @file eliminate_some_dict.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Jan 26 19:41:32 CST 2022
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct EliminateSomeDict {
+    EliminateSomeDict(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        GRAPH_DUMP("before eliminate_some_dicts Graph: ", graph_);
+        bool changed = eliminate_dict_getitems(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("after eliminate_some_dicts Graph: ", graph_);
+        return;
+    }
+
+private:
+    /*
+    * @brief
+    * Rewrite graphs of the following form:
+    *   %key1 : str = prim::Constant[value="first_key"]()
+    *   %key2 : str = prim::Constant[value="second_key"]()
+    *   %key3 : str = prim::Constant[value="third_key"]()
+    *   %resdict = prim::DictConstruct(%key1, %value1, %key2, %value2, %key3, %value3)
+    *   %res1 = aten::__getitem__(%resdict, %key1)
+    *   %res2 = aten::__getitem__(%resdict, %key2)
+    *   %1 = aten::matmul(%res1, %const.1)
+    *   %2 = aten::matmul(%res2, %const.2)
+    *
+    * into the following graph:
+    *   %1 = aten::matmul(%value1, %const.1)
+    *   %2 = aten::matmul(%value2, %const.2)
+    *
+    * Note in particular:
+    * 1. When there are multiple __getitem__ users, prim::DictConstruct must not be
+    *    deleted until every __getitem__ has been handled.
+    * */
+    bool is_dict_getitem_node(Node* node) {
+        if (node->kind() != aten::__getitem__ ||
+            node->inputs().at(1)->node()->kind() != prim::Constant) {
+            return false;
+        }
+        auto producer_node = node->inputs().at(0)->node();
+        if (producer_node->kind() != prim::DictConstruct ||
+            producer_node->owningBlock() != node->owningBlock()) {
+            return false;
+        }
+
+        //TODO: this could be relaxed to a looser condition
+        for (auto &use: node->inputs().at(0)->uses()) {
+            if (use.user->kind() != aten::__getitem__) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    bool eliminate_dict_getitem_after_construct(Node* node) {
+        // Extract the key of the __getitem__ node, i.e. the value of its second input.
+        c10::optional<c10::IValue> maybe_key = toIValue(node->inputs().at(1)->node()->output());
+        if (!maybe_key.has_value()) {
+            LOG(INFO) << "can not handle get_item node: " << node_info(node);
+            return false;
+        }
+        auto key = maybe_key.value();
+
+        // Find the matching key among the DictConstruct inputs and substitute the corresponding value.
+        auto producer_node = node->inputs().at(0)->node();
+        at::ArrayRef<Value*> producer_inputs = producer_node->inputs();
+        size_t num_inputs = producer_inputs.size();
+        for (size_t index = 0; index < num_inputs / 2; index++) {
+            if (producer_inputs[index * 2]->node()->kind() != prim::Constant) {
+                continue;
+                // LOG(INFO) << "can not handle DictConstruct node: " << node_info(producer_node);
+                // return false;
+            } else {
+                c10::optional<c10::IValue> ivalue = toIValue(producer_inputs[index * 2]->node()->output());
+                if (ivalue.has_value() && ivalue.value() == key) {
+                    // Replace all uses of the output value.
+                    node->outputs()[0]->replaceAllUsesWith(producer_inputs[index * 2 + 1]);
+                    // This node can be destroyed now.
+                    LOG(INFO) << "replace all uses from value: %" << node->outputs()[0]->debugName()
+                            << " to value: %" << producer_inputs[index * 2 + 1]->debugName();
+                    LOG(INFO) << "destroy getitem node now: " << node_info(node);
+                    node->destroy();
+                    break;
+                }
+            }
+        }
+
+        // When this was the only __getitem__ node, the producer could be destroyed as well.
+        // (No need to do it here: EliminateDeadCode will clean up producer_node.)
+        // if (producer_node->outputs()[0]->uses().size() == 0) {
+        //     LOG(INFO) << "destroy dictConstruct node now: " << node_info(producer_node);
+        //     producer_node->destroy();
+        // }
+        return true;
+    }
+
+    bool eliminate_dict_getitems(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+            // we might destroy the current node, so we need to pre-increment the iterator
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= eliminate_dict_getitems(subblock);
+            }
+            if (is_dict_getitem_node(node)) {
+                LOG(INFO) << "meet dict getitem after construct node :" << node_info(node);
+                changed |= eliminate_dict_getitem_after_construct(node);
+            }
+        }
+        return changed;
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+
+} // namespace
+
+void eliminate_some_dict(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateSomeDict esd(std::move(graph));
+    esd.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/eliminate_some_list.cpp b/poros/src/poros/lowering/eliminate_some_list.cpp
new file mode 100644
index 0000000000..72925bb3ec
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_some_list.cpp
@@ -0,0 +1,340 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file eliminate_some_list.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Sep 23 11:15:49 CST 2021
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct EliminateSomeList {
+    EliminateSomeList(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        GRAPH_DUMP("before eliminate_some_lists Graph: ", graph_);
+        bool changed = eliminate_list_unpacks(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        changed = eliminate_list_getitems(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("after eliminate_some_lists Graph: ", graph_);
+        return;
+    }
+
+private:
+    /*
+    * @brief
+    * Rewrite graphs of the following form:
+    *   %reslist = prim::ListConstruct(%1, %2, %3)
+    *   %res1, %res2, %res3 = prim::ListUnpack(%reslist)
+    *   %4 = aten::matmul(%res1, %const.0)
+    *   %5 = aten::matmul(%res2, %const.1)
+    *   %6 = aten::matmul(%res3, %const.2)
+    *
+    * or of the following form:
+    *   %reslist = prim::ListConstruct(%1)
+    *   %reslist.2 = aten::append(%reslist, %2)
+    *   %reslist.3 = aten::append(%reslist, %3)
+    *   %res1, %res2, %res3 = prim::ListUnpack(%reslist)
+    *   %4 = aten::matmul(%res1, %const.0)
+    *   %5 = aten::matmul(%res2, %const.1)
+    *   %6 = aten::matmul(%res3, %const.2)
+    *
+    * into the following graph:
+    *   %4 = aten::matmul(%1, %const.0)
+    *   %5 = aten::matmul(%2, %const.1)
+    *   %6 = aten::matmul(%3, %const.2)
+    *
+    * Note in particular:
+    * 1. For the ListConstruct + append pattern, all of these nodes must live in the same
+    *    block: if an append sits in a sub-block, the number of appends cannot be determined.
+    * 2. Between ListConstruct and ListUnpack, no op other than the same-block appends may
+    *    mutate the list; otherwise the rewrite would have unexpected effects.
+    */
+    bool is_list_unpack_pattern(Node* node) {
+        if (node->kind() != prim::ListUnpack) {
+            return false;
+        }
+
+        auto input_value = node->inputs().at(0);
+        auto producer_node = node->inputs().at(0)->node();
+
+        if (producer_node->kind() != prim::ListConstruct ||
+            producer_node->owningBlock() != node->owningBlock()) {
+            return false;
+        }
+
+        for (auto &use: input_value->uses()) {
+            if (use.user->kind() == aten::append) {
+                if (use.user->output()->uses().size() != 0 ||
+                    use.user->owningBlock() != node->owningBlock()) {
+                    // the aten::append output is used by another node, or lives in a different block
+                    LOG(INFO) << "find unmatched pattern: " << node_info(node);
+                    return false;
+                }
+            } else if (use.user->kind() == prim::ListUnpack) {
+                continue;
+            } else {
+                // not the pattern we are looking for
+                LOG(INFO) << "find unmatched pattern: " << node_info(node);
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool eliminate_list_unpack_after_construct(Node* node) {
+        //auto input_value = node->inputs().at(0);
+        auto producer_node = node->inputs().at(0)->node();
+
+        auto output_num = node->outputs().size();
+        auto input_num = producer_node->inputs().size() + node->inputs()[0]->uses().size() - 1;
+        if (input_num != output_num) {
+            LOG(WARNING) << "ListConstruct + aten::append input_num not equal prim::ListUnpack output_num, "
+                        << "bypass this node: " << node_info(node);
+            return false;
+        }
+
+        std::vector<Value*> input_value_list;
+        // stash the inputs of prim::ListConstruct
+        for (auto value : producer_node->inputs()) {
+            input_value_list.push_back(value);
+        }
+
+        //TODO: should these be sorted??
+        std::vector<Node*> append_node_list;
+        for (auto &use: node->inputs()[0]->uses()) {
+            if (use.user->kind() == aten::append) {
+                // stash the second input of each aten::append
+                input_value_list.push_back(use.user->inputs()[1]);
+                append_node_list.push_back(use.user);
+            }
+        }
+
+        if (input_value_list.size() != output_num) {
+            LOG(WARNING) << "ListConstruct + aten::append input_num not equal prim::ListUnpack output_num, "
+                        << "bypass this node: " << node_info(node);
+            return false;
+        }
+
+        int index = 0;
+        // replace all uses of the outputs
+        for (auto output_value : node->outputs()) {
+            auto replace_value = input_value_list[index++];
+            output_value->replaceAllUsesWith(replace_value);
+        }
+
+        // the ListUnpack node can be destroyed now
+        LOG(INFO) << "destroy listUnpack node now: " << node_info(node);
+        node->destroy();
+
+        // the aten::append nodes can be destroyed now
+        for (auto &append_node: append_node_list) {
+            LOG(INFO) << "destroy aten::append node now: " << node_info(append_node);
+            append_node->destroy();
+        }
+
+        // producer_node can be destroyed now
+        LOG(INFO) << "destroy listConstruct node now: " << node_info(producer_node);
+        producer_node->destroy();
+        return true;
+    }
+
+    /*
+    * @brief
+    * Rewrite graphs of the following form:
+    *   %reslist = prim::ListConstruct(%1, %2, %3)
+    *   %4 : int = prim::Constant[value=-1]()
+    *   %res1 = aten::__getitem__(%reslist, %const.0)
+    *   %5 = aten::matmul(%res1, %const.1)
+    *
+    * or of the following form:
+    *   %reslist = prim::ListConstruct(%1)
+    *   %reslist.2 = aten::append(%reslist, %2)
+    *   %reslist.3 = aten::append(%reslist, %3)
+    *   %4 : int = prim::Constant[value=-1]()
+    *   %res1 = aten::__getitem__(%reslist, %4)
+    *   %5 = aten::matmul(%res1, %const.1)
+    *
+    * into the following graph:
+    *   %5 = aten::matmul(%3, %const.1)
+    *
+    * Note in particular:
+    * 1. When there are multiple __getitem__ nodes, the append info and the ListConstruct
+    *    must not be deleted until every __getitem__ has been handled.
+    * */
+    bool is_list_getitem_node(Node* node) {
+        if (node->kind() != aten::__getitem__ ||
+            node->inputs().at(1)->node()->kind() != prim::Constant) {
+            return false;
+        }
+
+        auto producer_node = node->inputs().at(0)->node();
+        if (producer_node->kind() != prim::ListConstruct ||
+            producer_node->owningBlock() != node->owningBlock()) {
+            return false;
+        }
+
+        return true;
+    }
+
+    bool eliminate_list_getitem_after_construct(Node* node) {
+        auto input_value = node->inputs().at(0);
+        auto producer_node = input_value->node();
+
+        int get_item_count = 0;
+        for (auto &use: input_value->uses()) {
+            if (use.user->kind() == aten::append) {
+                if (use.user->output()->uses().size() != 0 ||
+                    use.user->owningBlock() != node->owningBlock()) {
+                    // the aten::append output is used by another node, or lives in a different block
+                    LOG(INFO) << "find unmatched pattern: " << node_info(node);
+                    return false;
+                }
+            } else if (use.user->kind() == aten::__getitem__) {
+                get_item_count++;
+                continue;
+            } else {
+                // not the pattern we are looking for
+                LOG(INFO) << "find unmatched pattern: " << node_info(node);
+                return false;
+            }
+        }
+
+        LOG(INFO) << "find list getitem after construct pattern: " << node_info(node);
+        auto input_num = producer_node->inputs().size() + node->inputs()[0]->uses().size() - 1;
+
+        std::vector<Value*> input_value_list;
+        // stash the inputs of prim::ListConstruct
+        for (auto value : producer_node->inputs()) {
+            input_value_list.push_back(value);
+        }
+
+        //TODO: should these be sorted??
+        std::vector<Node*> append_node_list;
+        for (auto &use: node->inputs()[0]->uses()) {
+            if (use.user->kind() == aten::append) {
+                // stash the second input of each aten::append
+                input_value_list.push_back(use.user->inputs()[1]);
+                append_node_list.push_back(use.user);
+            }
+        }
+
+        // compute the index value
+        int64_t index = toIValue((node->inputs()[1])->node()->output()).value().toInt();
+        index = index < 0 ? input_num + index : index;
+        LOG(INFO) << "calculated getitem index is: " << index;
+
+        // replace all uses of the output value
+        node->outputs()[0]->replaceAllUsesWith(input_value_list[index]);
+
+        // this node can be destroyed now
+        LOG(INFO) << "destroy getitem node now: " << node_info(node);
+        node->destroy();
+
+        // when this was the only __getitem__ node, the aten::append nodes and the producer
+        // can be destroyed as well
+        if (get_item_count == 1) {
+            for (auto &append_node : append_node_list) {
+                LOG(INFO) << "destroy aten::append node now: " << node_info(append_node);
+                append_node->destroy();
+            }
+            /* The following iteration style may crash (core dump):
+            auto users_count = producer_node->output()->uses().size();
+            for (int user_index = users_count; user_index >= 0; user_index--) {
+                auto append_node = producer_node->output()->uses()[user_index].user;
+                if (append_node->kind() == aten::append) {
+                    LOG(INFO) << "destroy aten::append node now: " << node_info(append_node);
+                    append_node->destroy();
+                }
+            }*/
+            // producer_node can be destroyed now
+            LOG(INFO) << "destroy listConstruct node now: " << node_info(producer_node);
+            producer_node->destroy();
+        }
+        return true;
+    }
+
+    bool eliminate_list_unpacks(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+            // we might destroy the current node, so we need to pre-increment
+            // the iterator
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= eliminate_list_unpacks(subblock);
+            }
+            if (is_list_unpack_pattern(node)) {
+                LOG(INFO) << "find list unpack after construct pattern: " << node_info(node);
+                changed |= eliminate_list_unpack_after_construct(node);
+            }
+        }
+        return changed;
+    }
+
+    bool eliminate_list_getitems(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+            // we might destroy the current node, so we need to pre-increment
+            // the iterator
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= eliminate_list_getitems(subblock);
+            }
+            if (is_list_getitem_node(node)) {
+                //LOG(INFO) << "meet list getitem after construct node :" << node_info(node);
+                changed |= eliminate_list_getitem_after_construct(node);
+            }
+        }
+        return changed;
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+
+} // namespace
+
+void eliminate_some_list(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateSomeList esl(std::move(graph));
+    esl.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/eliminate_subgraph_uesless_nodes.cpp b/poros/src/poros/lowering/eliminate_subgraph_uesless_nodes.cpp
new file mode 100644
index 0000000000..29ef7d904d
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_subgraph_uesless_nodes.cpp
@@ -0,0 +1,151 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
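+
+// Illustrative (hypothetical) case handled below: when a fused subgraph input is
+// produced by
+//     %x.1 : Tensor = aten::to(%x, %cuda, %none, %false, %false)
+// the aten::to node can be dropped and the subgraph fed %x directly, because
+// engine execution moves CPU inputs onto the GPU anyway.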
+
+/**
+* @file eliminate_subgraph_useless_nodes.cpp
+* @author tianshaoqing@baidu.com
+* @date Thu May 16 19:49:02 CST 2022
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+using namespace torch::jit;
+
+bool eliminate_subgraph_useless_nodes(std::shared_ptr<Graph> subgraph,
+                torch::jit::Node& subgraph_node,
+                const bool is_input) {
+    AT_ASSERT(subgraph_node.kind() == torch::jit::prim::CudaFusionGroup);
+    // init useless schema set
+    std::unordered_set<c10::OperatorName> useless_schema_set;
+    useless_schema_set.emplace(torch::jit::parseSchema(
+            "aten::to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False,"
+            " bool copy=False, MemoryFormat? memory_format=None) -> Tensor").operator_name());
+    useless_schema_set.emplace(torch::jit::parseSchema(
+            "aten::to.prim_Device(Tensor(a) self, Device? device, int? dtype=None,"
+            " bool non_blocking=False, bool copy=False) -> (Tensor(b|a))").operator_name());
+    useless_schema_set.emplace(torch::jit::parseSchema("aten::contiguous(Tensor(a) self, *, "
+            "MemoryFormat memory_format=contiguous_format) -> Tensor(a)").operator_name());
+    useless_schema_set.emplace(torch::jit::parseSchema("aten::dropout(Tensor input, float p, "
+            "bool train) -> Tensor").operator_name());
+    useless_schema_set.emplace(torch::jit::parseSchema("aten::detach(Tensor(a) self) -> Tensor(a)").operator_name());
+    useless_schema_set.emplace(torch::jit::parseSchema("aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)")\
+            .operator_name());
+
+    // execute_engine moves CPU inputs to CUDA anyway, so a to(cuda) right before
+    // a subgraph input can be removed.
+    if (is_input) {
+        // first handle subgraph inputs produced by aten::to.device
+        at::ArrayRef<Value*> node_inputs = subgraph_node.inputs();
+        for (size_t i = 0; i < node_inputs.size(); i++) {
+            torch::jit::Node* maybe_to_device_node = node_inputs[i]->node();
+            // an aten::to.device found in the set may only be removed when its dtype
+            // argument is the default (None)
+            if (maybe_to_device_node->kind() == torch::jit::aten::to &&
+                useless_schema_set.count(maybe_to_device_node->schema().operator_name()) != 0 &&
+                maybe_to_device_node->input(2)->node()->kind() == torch::jit::prim::Constant &&
+                toIValue(maybe_to_device_node->inputs()[2])->isNone()) {
+
+                auto to_device_users = maybe_to_device_node->output(0)->uses();
+                // every user of the aten::to.device output must be a prim::CudaFusionGroup
+                bool all_users_cfg = true;
+                for (size_t u = 0; u < to_device_users.size(); u++) {
+                    if (to_device_users[u].user->kind() != prim::CudaFusionGroup) {
+                        all_users_cfg = false;
+                        break;
+                    }
+                }
+                if (!all_users_cfg) {
+                    continue;
+                }
+                // replace the input of every subgraph that consumes the aten::to.device output
+                for (size_t u = 0; u < to_device_users.size(); u++) {
+                    to_device_users[u].user->replaceInput(to_device_users[u].offset, maybe_to_device_node->input(0));
+                    LOG(INFO) << "Remove aten::to.device input[" << i << "] of subgraph: " <<
+                            node_info(to_device_users[u].user) << ", which is useless.";
+                }
+                LOG(INFO) << "Destroy node schema: [ " << maybe_to_device_node->schema() << " ]";
+                // remove the aten::to.device node
+                maybe_to_device_node->destroy();
+            }
+        }
+    } else {
+        int unconst_nodes_num = 0;
+        // remove aten::to.device (and other useless nodes) inside the subgraph
+        auto cudafusion_subblock_nodes = subgraph->block()->nodes();
+        for (auto c_it = cudafusion_subblock_nodes.begin(); c_it != cudafusion_subblock_nodes.end(); ) {
+            torch::jit::Node* maybe_useless_node = *c_it;
+            c_it++;
+            if (maybe_useless_node->kind() != torch::jit::prim::Constant) {
+                unconst_nodes_num++;
+            }
+            // the node has a schema && the schema is in useless_schema_set
+            if (maybe_useless_node->maybeSchema() &&
+                useless_schema_set.count(maybe_useless_node->schema().operator_name()) != 0) {
+                bool is_useless_node = false;
+                // for aten::to.device we must additionally check that the scalar type is None;
+                // otherwise the node must not be removed
+                if (maybe_useless_node->kind() == torch::jit::aten::to) {
+                    if (maybe_useless_node->input(2)->node()->kind() == torch::jit::prim::Constant &&
+                        toIValue(maybe_useless_node->inputs()[2])->isNone()) {
+                        is_useless_node = true;
+                    }
+                // Handle aten::select on a rank-1 tensor followed by aten::unsqueeze:
+                // in torch, selecting from a rank-1 tensor yields rank 0, and some models
+                // (e.g. faster-rcnn) unsqueeze the result back to rank 1 before further ops.
+                // The poros implementation of aten::select keeps the output nvtensor at rank 1,
+                // so the extra unsqueeze would yield rank 2 and break. Such aten::unsqueeze
+                // nodes are therefore removed inside the subgraph.
+                } else if (maybe_useless_node->kind() == torch::jit::aten::unsqueeze &&
+                        maybe_useless_node->inputs().size() == 2 &&
+                        maybe_useless_node->input(1)->node()->kind() == torch::jit::prim::Constant) {
+                    int64_t unsqueeze_dim = toIValue(maybe_useless_node->input(1)).value().toInt();
+                    torch::jit::Node* input0_node = maybe_useless_node->input(0)->node();
+                    if (input0_node->kind() == torch::jit::aten::select &&
+                        input0_node->outputs().size() == 1 &&
+                        input0_node->output(0)->type()->isSubtypeOf(c10::TensorType::get()) &&
+                        unsqueeze_dim == 0) {
+                        auto select_output_type = input0_node->output(0)->type()->cast<c10::TensorType>();
+                        // derive the rank from the c10::TensorType
+                        if (select_output_type->sizes().size().value() == 0) {
+                            is_useless_node = true;
+                        }
+                    }
+                } else {
+                    // other node kinds need no extra checks for now
+                    is_useless_node = true;
+                }
+
+                if (is_useless_node) {
+                    LOG(INFO) << "Remove " << node_info(maybe_useless_node) << " in subgraph: "<<
+                            node_info(&subgraph_node) << ", which is useless.";
+                    LOG(INFO) << "Destroy node schema: [ "<< maybe_useless_node->schema() << " ]";
+                    maybe_useless_node->output(0)->replaceAllUsesWith(maybe_useless_node->input(0));
+                    maybe_useless_node->destroy();
+                    unconst_nodes_num--;
+                }
+            }
+        }
+        // if only constant nodes remain after removing the useless ones,
+        // return false to unmerge the subgraph
+        if (unconst_nodes_num <= 0) {
+            return false;
+        }
+    }
+    return true;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/eliminate_useless_copy.cpp b/poros/src/poros/lowering/eliminate_useless_copy.cpp
new file mode 100644
index 0000000000..adfd48e214
--- /dev/null
+++ b/poros/src/poros/lowering/eliminate_useless_copy.cpp
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
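+
+// Sketch of the pattern this pass deletes:
+//     %out : Tensor = aten::copy_(%self, %src, %false)
+// where %out has no uses and %self has no user other than this copy_, so
+// removing the node cannot change any observable value.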
+
+/**
+* @file eliminate_useless_copy.cpp
+* @author tianjinjin@baidu.com
+* @date Thu Dec 16 16:27:02 CST 2021
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct EliminateUselessCopy {
+    EliminateUselessCopy(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        GRAPH_DUMP("before eliminate_useless_copys Graph: ", graph_);
+        bool changed = eliminate_useless_copys(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("after eliminate_useless_copys Graph: ", graph_);
+        return;
+    }
+
+private:
+    /*
+    * @brief
+    * Relevant schema: aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+    * Original pytorch implementation: https://github.com/pytorch/pytorch/blob/v1.9.0/aten/src/ATen/native/Copy.cpp#L246
+    *
+    * For nodes of the form %output = aten::copy_(%self, %src, %non_blocking),
+    * find the aten::copy_ nodes that satisfy both of the conditions below:
+    * 1. %output is not used by any other node
+    * 2. %self has no user other than this aten::copy_ node itself
+    * When an op satisfies both conditions, the node can be deleted directly.
+    */
+    bool is_node_useless_copy_pattern(Node* node) {
+        if (node->kind() != aten::copy_) {
+            return false;
+        }
+
+        if (node->inputs().at(0)->uses().size() == 1 &&
+            node->outputs().at(0)->uses().size() == 0) {
+            return true;
+        }
+
+        LOG(INFO) << "find unmatched pattern: " << node_info(node);
+        return false;
+    }
+
+    bool eliminate_useless_copy_node(Node* node) {
+        // this node can be destroyed now
+        LOG(INFO) << "destroy aten::copy_ node now: " << node_info(node);
+        node->destroy();
+        return true;
+    }
+
+    bool eliminate_useless_copys(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().rbegin(); it != block->nodes().rend();) {
+            // we might destroy the current node, so we need to pre-increment
+            // the iterator
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= eliminate_useless_copys(subblock);
+            }
+            if (is_node_useless_copy_pattern(node)) {
+                LOG(INFO) << "find useless aten copy pattern: " << node_info(node);
+                changed |= eliminate_useless_copy_node(node);
+            }
+        }
+        return changed;
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+
+} // namespace
+
+void eliminate_useless_copy(std::shared_ptr<torch::jit::Graph> graph) {
+    EliminateUselessCopy euc(std::move(graph));
+    euc.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_clip.cpp b/poros/src/poros/lowering/fuse_clip.cpp
new file mode 100644
index 0000000000..dc1ddb6de4
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_clip.cpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
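+
+// Background: since PyTorch 1.7, torch.clip has been an alias of torch.clamp with
+// an identical schema, which is what makes the one-to-one rewrite below safe.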
+
+/**
+* @file fuse_clip.cpp
+* @author tianshaoqing@baidu.com
+* @date 2022-08-01 16:08:26
+* @brief
+**/
+
+#include "poros/lowering/fuse_clip.h"
+
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * ReplaceClip
+ * @param graph
+ * @return true if graph changed, false if not
+ */
+bool FuseClip::fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    // schema: aten::clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    if (try_to_replace_clip(graph->block())) {
+        std::string new_pattern = R"IR(
+            graph(%x, %min, %max):
+                %out : Tensor = aten::clamp(%x, %min, %max)
+                return (%out))IR";
+
+        std::string old_pattern = R"IR(
+            graph(%x, %min, %max):
+                %out : Tensor = aten::clip(%x, %min, %max)
+                return (%out))IR";
+
+        torch::jit::SubgraphRewriter std_rewriter;
+        std_rewriter.RegisterRewritePattern(old_pattern, new_pattern);
+        std_rewriter.runOnGraph(graph);
+
+        return true;
+    }
+    return false;
+}
+
+/**
+ * search for aten::clip recursively, record all findings
+ * @param block
+ * @return true if at least one aten::clip found, false if none found
+ */
+bool FuseClip::try_to_replace_clip(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it; //++it first, node may be destroyed later.
+        for (auto sub_block: node->blocks()) {
+            if (try_to_replace_clip(sub_block)) {
+                graph_changed = true;
+            }
+        }
+        // only handle the aten::clip case
+        if (node->kind() == torch::jit::aten::clip) {
+            graph_changed = true;
+            record_transform(torch::jit::aten::clip)->to(torch::jit::aten::clamp);
+        }
+    }
+    return graph_changed;
+}
+
+FuseClip::FuseClip() = default;
+
+REGISTER_OP_FUSER(FuseClip)
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_clip.h b/poros/src/poros/lowering/fuse_clip.h
new file mode 100644
index 0000000000..e947ab0f4b
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_clip.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_clip.h
+* @author tianshaoqing@baidu.com
+* @date 2022-08-01 16:08:26
+* @brief
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class FuseClip : public IFuser {
+public:
+    FuseClip();
+    /**
+     * FuseClip
+     * @param graph
+     * @return true if graph changed, false if not
+     */
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+private:
+    /**
+     * search for aten::clip recursively, record all findings
+     * @param block
+     * @return true if at least one clip found, false if none found
+     */
+    bool try_to_replace_clip(torch::jit::Block *block);
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_conv_bn.cpp b/poros/src/poros/lowering/fuse_conv_bn.cpp
new file mode 100644
index 0000000000..af195b20f4
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_conv_bn.cpp
@@ -0,0 +1,163 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_conv_bn.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:19
+* @brief
+**/
+
+#include "poros/lowering/fuse_conv_bn.h"
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+struct TORCH_API ConvBNParameters {
+    at::Tensor conv_w;
+    at::Tensor conv_b;
+    at::Tensor bn_rm;
+    at::Tensor bn_rv;
+    double bn_eps = 0.0;
+    at::Tensor bn_w;
+    at::Tensor bn_b;
+};
+
+// calculate the fused weights and bias
+std::tuple<at::Tensor, at::Tensor> CalcFusedConvWeightAndBias(
+        const ConvBNParameters &p) {
+    at::Tensor bn_var_rsqrt = at::rsqrt(p.bn_rv + p.bn_eps);
+    const int64_t ndim = p.conv_w.dim();
+    at::DimVector sizes(ndim, 1);
+    sizes.at(0) = -1;
+    at::Tensor new_w = p.conv_w * (p.bn_w * bn_var_rsqrt).reshape(sizes);
+    at::Tensor new_b = (p.conv_b - p.bn_rm) * bn_var_rsqrt * p.bn_w + p.bn_b;
+    return std::make_tuple(new_w, new_b);
+}
+
+bool FuseConvBatchNorm::fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    graph_ = graph;
+    return try_to_fuse_conv_batchnorm(graph_->block());
+}
+
+FuseConvBatchNorm::FuseConvBatchNorm() = default;
+
+bool FuseConvBatchNorm::try_to_fuse_conv_batchnorm(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it; //++it first, node may be destroyed later.
+        for (auto sub_block: node->blocks()) {
+            if (try_to_fuse_conv_batchnorm(sub_block)) {
+                graph_changed = true;
+            }
+        }
+        // only handle the aten::conv* + aten::batch_norm pattern
+        if (node->kind() != torch::jit::aten::batch_norm) {
+            continue;
+        }
+
+        auto all_users = node->inputs()[0]->uses();
+        if (all_users.size() != 1 || ((node->inputs()[0])->node()->kind() != torch::jit::aten::conv1d &&
+                (node->inputs()[0])->node()->kind() != torch::jit::aten::conv2d &&
+                (node->inputs()[0])->node()->kind() != torch::jit::aten::conv3d &&
+                (node->inputs()[0])->node()->kind() != torch::jit::aten::_convolution)) {
+            continue;
+        }
+
+        auto bn = node;
+        auto conv = (node->inputs()[0])->node();
+
+        // More parameters need to be checked when the node is aten::_convolution.
+        if (conv->schema().operator_name() == torch::jit::parseSchema("aten::_convolution(Tensor input, Tensor weight, "
+                "Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, "
+                "bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor ").operator_name()) {
+            bool transposed = toIValue(conv->input(6)).value().toBool();
+            // deconvolution is not supported.
+            if (transposed) {
+                LOG(INFO) << "The transposed flag of aten::_convolution is true; fusing conv+bn is not supported for it currently.";
+                continue;
+            }
+            // output_padding is not supported.
+            std::vector<int64_t> output_padding = toIValue(conv->input(7)).value().toIntVector();
+            bool has_nonzero_output_padding = false;
+            for (int64_t o : output_padding) {
+                if (o != 0) {
+                    has_nonzero_output_padding = true;
+                    break;
+                }
+            }
+            // the check is hoisted out of the loop: a `continue` inside the
+            // range-for would only skip the inner iteration, not this batch_norm node
+            if (has_nonzero_output_padding) {
+                LOG(INFO) << "The output_padding of aten::_convolution is not equal to zero; "
+                        "fusing conv+bn is not supported for it currently.";
+                continue;
+            }
+            // other parameters like benchmark, deterministic, cudnn_enabled and allow_tf32 do not need to be checked for now.
+        }
+
+        ConvBNParameters params;
+        // conv weights and bias
+        if (!(conv->inputs()[1])->type()->isSubtypeOf(c10::TensorType::get()) ||  //conv_weight
+            // !(conv->inputs()[2])->type()->isSubtypeOf(c10::TensorType::get()) ||  //conv_bias (maybe is None)
+            !(bn->inputs()[1])->type()->isSubtypeOf(c10::TensorType::get()) ||    //bn_weight
+            !(bn->inputs()[2])->type()->isSubtypeOf(c10::TensorType::get()) ||    //bn_bias
+            !(bn->inputs()[3])->type()->isSubtypeOf(c10::TensorType::get()) ||    //bn_mean
+            !(bn->inputs()[4])->type()->isSubtypeOf(c10::TensorType::get()) ||    //bn_var
+            !(bn->inputs()[7])->type()->isSubtypeOf(c10::FloatType::get())) {     //bn_eps (default=1e-5)
+            continue;
+        }
+
+        // record the fusing ops for debug
+        record_transform(conv, bn)->to(conv);
+
+        params.conv_w = toIValue(conv->inputs()[1]).value().toTensor();
+        if (toIValue(conv->inputs()[2]).value().isNone()) {
+            params.conv_b = torch::zeros({params.conv_w.size(0)}, {params.conv_w.device()}).to(params.conv_w.type());
+        } else {
+            params.conv_b = toIValue(conv->inputs()[2]).value().toTensor();
+        }
+        params.bn_w = toIValue(bn->inputs()[1]).value().toTensor();
+        params.bn_b = toIValue(bn->inputs()[2]).value().toTensor();
+        params.bn_rm = toIValue(bn->inputs()[3]).value().toTensor();
+        params.bn_rv = toIValue(bn->inputs()[4]).value().toTensor();
+        params.bn_eps = toIValue(bn->inputs()[7]).value().toDouble();
+
+        // calc new weights and bias
+        auto w_b = CalcFusedConvWeightAndBias(params);
+
+        at::Tensor weights = std::get<0>(w_b);
+        at::Tensor bias = std::get<1>(w_b);
+
+        torch::jit::WithInsertPoint guard(graph_->block()->nodes().front());
+        auto conv_w = graph_->insertConstant(weights);
+        auto conv_b = graph_->insertConstant(bias);
+        conv_w->node()->moveBefore(conv);
+        conv_b->node()->moveBefore(conv);
+
+        conv->replaceInput(1, conv_w);
+        conv->replaceInput(2, conv_b);
+
+        bn->output()->replaceAllUsesWith(conv->output());
+        bn->removeAllInputs();
+        bn->destroy();
+
+        graph_changed = true;
+    }
+    return graph_changed;
+}
+
+REGISTER_OP_FUSER(FuseConvBatchNorm)
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_conv_bn.h b/poros/src/poros/lowering/fuse_conv_bn.h
new file mode 100644
index 0000000000..2c7e612127
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_conv_bn.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
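+
+// The fusion folds the batch_norm parameters into the preceding convolution
+// (see CalcFusedConvWeightAndBias in fuse_conv_bn.cpp):
+//     W' = W * gamma / sqrt(running_var + eps)
+//     b' = (b - running_mean) * gamma / sqrt(running_var + eps) + beta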
+
+/**
+* @file fuse_conv_bn.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:19
+* @brief
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class FuseConvBatchNorm : public IFuser {
+public:
+    FuseConvBatchNorm();
+
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    bool try_to_fuse_conv_batchnorm(torch::jit::Block *block);
+
+    std::shared_ptr<torch::jit::Graph> graph_;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_conv_mul.cpp b/poros/src/poros/lowering/fuse_conv_mul.cpp
new file mode 100644
index 0000000000..4e7863a8a4
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_conv_mul.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file: fuse_conv_mul.cpp
+* @author: zhangfan51@baidu.com
+* @date: 2022-04-24 18:43:02
+* @brief:
+**/
+#include "poros/lowering/fuse_conv_mul.h"
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+using namespace torch::jit;
+
+FuseConvMul::FuseConvMul() = default;
+
+/**
+ * FuseConvMul
+ * @param graph
+ * @return true if graph changed, false if not
+ */
+bool FuseConvMul::fuse(std::shared_ptr<Graph> graph) {
+    graph_ = graph;
+    return try_to_fuse_conv_mul(graph_->block());
+}
+
+/**
+ * search for the aten::conv + aten::mul pattern to fuse
+ * @param block
+ * @return true if fuse success
+ */
+bool FuseConvMul::try_to_fuse_conv_mul(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it; //++it first, node may be destroyed later.
+        for (auto sub_block : node->blocks()) {
+            if (try_to_fuse_conv_mul(sub_block)) {
+                graph_changed = true;
+            }
+        }
+        // find ops matching "aten::mul.Scalar(Tensor self, Scalar other) -> Tensor"
+        if (node->kind() != aten::mul) {
+            continue;
+        }
+
+        // the first input must be a Tensor and the second a constant scalar
+        if (!node->inputs()[0]->type()->isSubtypeOf(c10::TensorType::get()) ||
+            node->inputs()[1]->node()->kind() != prim::Constant) {
+            continue;
+        }
+
+        auto conv = node->inputs()[0]->node();
+        if ((conv->kind() != aten::conv2d && conv->kind() != aten::conv1d && conv->kind() != aten::conv3d) ||
+            node->inputs()[0]->uses().size() != 1) {
+            continue;
+        }
+
+        if (!(conv->inputs()[1])->type()->isSubtypeOf(c10::TensorType::get()) ||  // conv_weight
+            conv->inputs()[1]->uses().size() != 1) {
+            continue;
+        }
+        at::Tensor conv_w = toIValue(conv->inputs()[1])->toTensor();
+        float scale = toIValue(node->inputs()[1])->toScalar().toFloat();
+
+        torch::jit::WithInsertPoint guard(graph_->block()->nodes().front());
+        // check bias
+        if (conv->inputs()[2]->type()->isSubtypeOf(c10::TensorType::get())) {
+            if (conv->inputs()[2]->uses().size() != 1) {
+                continue;
+            }
+            at::Tensor conv_b = toIValue(conv->inputs()[2])->toTensor();
+            auto new_conv_b = graph_->insertConstant(conv_b * scale);
+            new_conv_b->setDebugName(conv->inputs()[2]->debugName() + ".scale");
+            // replace the bias of the conv
+            conv->inputs().at(2)->replaceAllUsesWith(new_conv_b);
+        }
+
+        auto new_conv_w = graph_->insertConstant(conv_w * scale);
+        new_conv_w->setDebugName(conv->inputs()[1]->debugName() + ".scale");
+
+        // replace the weight of the conv
+        conv->inputs().at(1)->replaceAllUsesWith(new_conv_w);
+        // redirect all users of the aten::mul output to the conv output
+        node->output()->replaceAllUsesWith(conv->output());
+
+        LOG(INFO) << "Found fuse_conv2d_mul, node = " << *node;
+        // remove the aten::mul node
+        node->removeAllInputs();
+        node->destroy();
+        graph_changed = true;
+    }
+    return graph_changed;
+}
+
+REGISTER_OP_FUSER(FuseConvMul)
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_conv_mul.h b/poros/src/poros/lowering/fuse_conv_mul.h
new file mode 100644
index 0000000000..ad67fc34dc
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_conv_mul.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file: fuse_conv_mul.h
+* @author: zhangfan51@baidu.com
+* @date: 2022-04-24 18:41:20
+* @brief:
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * %3 : int = prim::Constant[value=1]()
+ * %4 : float = prim::Constant[value=2.0]()
+ * %1 : Tensor = aten::conv2d(%0, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+ * %2 : Tensor = aten::mul(%1, %4)
+ *
+ * As in the IR above, FuseConvMul folds the mul of a conv + mul pair into the conv,
+ * saving one mul computation; after fusing, more conv-oriented optimization passes
+ * can match on the graph.
+ * Restriction: the aten::mul input %4 must be a constant.
+ */
+class FuseConvMul : public IFuser {
+public:
+    FuseConvMul();
+
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    bool try_to_fuse_conv_mul(torch::jit::Block *block);
+
+    std::shared_ptr<torch::jit::Graph> graph_;
+};
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/fuse_copy.cpp b/poros/src/poros/lowering/fuse_copy.cpp
new file mode 100644
index 0000000000..de94bbc328
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_copy.cpp
@@ -0,0 +1,502 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
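+
+// Illustrative (hypothetical) Python-level example of what this pass handles:
+//     x[1:3, 0] = y
+// TorchScript lowers such an assignment into a chain of aten::slice / aten::select
+// nodes feeding an aten::copy_ (or aten::index_put_); the pass below folds the
+// chain into explicit index tensors and rewrites the group as a single index_put.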
+
+/**
+* @file: fuse_copy.cpp
+* @author: tianjinjin@baidu.com
+* @date: Wed Jun 16 20:28:36 CST 2021
+* @brief:
+**/
+
+#include "poros/lowering/fuse_copy.h"
+
+#include <torch/version.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/ir/named_value.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+#include <torch/csrc/jit/passes/erase_number_types.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+using namespace torch::jit;
+
+FuseCopy::FuseCopy() = default;
+
+/**
+ * FuseCopy
+ * @param graph
+ * @return true if graph changed, false if not
+ */
+bool FuseCopy::fuse(std::shared_ptr<Graph> graph) {
+    graph_ = graph;
+    GRAPH_DUMP("before fuse copy ops Graph: ", graph_);
+    bool fused = try_to_fuse_copy(graph_->block());
+    if (fused) {
+        ConstantPropagation(graph_);
+        EliminateDeadCode(graph_);
+        EliminateCommonSubexpression(graph_);
+        ConstantPooling(graph_);
+        //EraseNumberTypesOnBlock(graph_->block);
+        //EliminateDeadCode(graph_->block, true, DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS);
+    }
+    GRAPH_DUMP("after fuse copy ops Graph: ", graph_);
+    return fused;
+}
+
+/**
+* @brief Create an aten::size node that yields the size of the given dim.
+* **/
+Value* FuseCopy::create_size_of_dim(Value* input, int64_t dim, Node* insertBefore) {
+    auto graph = input->owningGraph();
+    WithInsertPoint guard(insertBefore);
+    auto size = graph->insert(aten::size, {input, dim});
+    LOG(INFO) << "create_size_of_dim before node: " << node_info(insertBefore);
+    LOG(INFO) << "create aten::size node: " << node_info(size->node());
+    return size;
+}
+
+/**
+* @brief Adjust the value input, in particular when dimensions have to be restored
+* (i.e. dimensions squeezed away by aten::select must be unsqueezed back).
+* **/
+void FuseCopy::adjust_value(Graph* graph,
+                Node* index_put_node,
+                const std::vector<Node*>& slice_and_select_nodes,
+                Value* orig_data) {
+    // Get the rank of a constant value. Rank 0 or 1 needs no special handling
+    // (although it could also be handled here...).
+    // Otherwise the tensor may have been produced by a select: its rank must be
+    // raised to match self before broadcasting.
+    bool need_unsqueeze_value = true;
+    Value* value = index_put_node->inputs().at(2);
+    if (value->node()->kind() == prim::Constant) {
+        at::Tensor value_tensor = toIValue(value).value().toTensor();
+        int64_t value_rank = value_tensor.dim();
+        if (value_rank == 0 || value_rank == 1) {
+            need_unsqueeze_value = false;
+        }
+    }
+
+    if (need_unsqueeze_value == true) {
+        int64_t dim_offset = 0;
+        for (auto it = slice_and_select_nodes.rbegin(); it != slice_and_select_nodes.rend(); ++it) {
+            Node* node = *it;
+            int64_t dim = toIValue(node->inputs().at(1)).value().toInt();
+            if (dim < 0) {
+                std::shared_ptr<c10::TensorType> input_type = orig_data->type()->expect<c10::TensorType>();
+                if (input_type->dim().has_value()) {
+                    int64_t rank = static_cast<int64_t>(input_type->dim().value());
+                    dim = dim + rank - dim_offset;
+                }
+            }
+            dim = dim + dim_offset;
+
+            if (node->kind() == aten::select) {
+                // restore the dimension squeezed away by this select
+                WithInsertPoint guard(index_put_node);
+                Value* unsqueeze = graph->insert(aten::unsqueeze, {index_put_node->inputs().at(2), dim});
+                LOG(INFO) << "create aten::unsqueeze node: " << node_info(unsqueeze->node());
+                index_put_node->replaceInput(2, unsqueeze);
+                dim_offset++;
+            }
+        }
+    }
+    return;
+}
+
+/**
+* @brief Create an aten::tensor node wrapping the index info.
+* **/
+Value* FuseCopy::convert_select_to_index(Value* index, Node* insertBefore) {
+    // Create index tensor based on index input of aten::select node.
+    auto graph = insertBefore->owningGraph();
+    WithInsertPoint guard(insertBefore);
+    Node* indices = graph->create(aten::tensor, {
+            index,
+            graph->insertConstant(c10::ScalarType::Long),
+            //graph->insertConstant(torch::Device(torch::DeviceType::CUDA, 0)),
+            graph->insertConstant(torch::Device(at::kCPU)),
+            graph->insertConstant(false)});
+
+    indices->copyMetadata(insertBefore);
+    indices->insertBefore(insertBefore);
+    LOG(INFO) << "convert_select_to_index before node: " << node_info(insertBefore);
+    LOG(INFO) << "create aten::tensor node: " << node_info(indices);
+    return indices->output();
+}
+
+/**
+* @brief Extract dim, start, end and step from an aten::slice node and turn them
+* into an index tensor.
+* **/
+Value* FuseCopy::convert_slice_to_index(Node* slice, Value* size, Node* insertBefore) {
+    // Create index tensor based on aten::slice node.
+    auto graph = slice->owningGraph();
+    WithInsertPoint guard(insertBefore);
+    TORCH_INTERNAL_ASSERT((slice->inputs()).size() == 5);
+    auto start = slice->inputs()[2];
+    auto end = slice->inputs()[3];
+    auto step = slice->inputs()[4];
+    //auto index = graph->insert(aten::arange, {size});
+    auto index = graph->insert(aten::arange, {size}, {NamedValue("dtype", c10::kLong)});
+    LOG(INFO) << "convert_slice_to_index before node: " << node_info(insertBefore);
+    LOG(INFO) << "create aten::arange node: " << node_info(index->node());
+    auto sliced_index_n = graph->create(aten::slice, {
+            index,
+            graph->insertConstant(at::Scalar(0)),
+            start,
+            end,
+            step});
+    LOG(INFO) << "create aten::slice node: " << node_info(sliced_index_n);
+    sliced_index_n->copyMetadata(insertBefore);
+    auto sliced_index = sliced_index_n->insertBefore(insertBefore)->output();
+    return sliced_index;
+}
+
+// torch >= 1.12 changed the Source API; stay compatible with both versions
+#if TORCH_VERSION_MAJOR >= 1 && TORCH_VERSION_MINOR >= 12
+#define NODE_SOURCE_TEXT(name) \
+    name->text_str()
+#else
+#define NODE_SOURCE_TEXT(name) \
+    name->text()
+#endif
+
+/**
+* @brief Find the slice and select ops associated with a copy_ or index_put_ op.
+* They come from the same line of Python source and together implement the
+* slicing of a list or tensor, e.g. a form like y = x[1:3, 0].
+// Example graph:
+// %306 : Float(*, 16, 64, 16, 16) = aten::slice(%out.4, %0, %none, %none, %1)
+// %307 : Float(*, 15, 64, 16, 16) = aten::slice(%306, %1, %none, %11, %1)
+// %308 : Float(*, 15, 8, 16, 16) = aten::slice(%307, %2, %none, %y, %1)
+// %309 : Tensor = aten::copy_(%308, %305, %false)
+* **/
+std::vector<Node*> FuseCopy::fetch_slice_and_select_pattern(const Node* node) {
+    TORCH_INTERNAL_ASSERT(node->kind() == aten::index_put ||
+                node->kind() == aten::index_put_ ||
+                node->kind() == aten::copy_);
+    const auto& node_source = node->sourceRange().source();
+
+    std::vector<Node*> slice_and_select_nodes;
+    auto src_node = node->input(0)->node();
+    while (src_node) {
+        auto& src_node_source = src_node->sourceRange().source();
+        if ((src_node->kind() == aten::slice || src_node->kind() == aten::select) &&
+            NODE_SOURCE_TEXT(node_source) == NODE_SOURCE_TEXT(src_node_source) &&
+            node_source->starting_line_no() == src_node_source->starting_line_no()) {
+            slice_and_select_nodes.emplace_back(src_node);
+            // slices usually come in a consecutive chain
+            src_node = src_node->input(0)->node();
+        } else {
+            src_node = nullptr;
+        }
+    }
+    return slice_and_select_nodes;
+}
+
+/**
+* @brief Merge the associated slice and select nodes into indices.
+* **/
+std::unordered_map<int64_t, ConvertedIndex> FuseCopy::merge_slice_and_select_to_indices(
+        Graph* graph,
+        Node* index_put_node,
+        const std::vector<Node*>& slice_and_select_nodes,
+        Value* orig_data) {
+
+    std::unordered_map<int64_t, ConvertedIndex> dim_index_map;
+    int64_t cur_dim = 0;
+    /* The meaning of dim_offset: when select and slice ops are mixed to slice a tensor,
+Value* FuseCopy::convert_slice_to_index(Node* slice, Value* size, Node* insertBefore) {
+    // Create index tensor based on aten::slice node.
+    auto graph = slice->owningGraph();
+    WithInsertPoint guard(insertBefore);
+    TORCH_INTERNAL_ASSERT((slice->inputs()).size() == 5);
+    auto start = slice->inputs()[2];
+    auto end = slice->inputs()[3];
+    auto step = slice->inputs()[4];
+    //auto index = graph->insert(aten::arange, {size});
+    auto index = graph->insert(aten::arange, {size}, {NamedValue("dtype", c10::kLong)});
+    LOG(INFO) << "convert_slice_to_index before node: " << node_info(insertBefore);
+    LOG(INFO) << "create aten::arange node: " << node_info(index->node());
+    auto sliced_index_n = graph->create(aten::slice, {
+            index,
+            graph->insertConstant(at::Scalar(0)),
+            start,
+            end,
+            step});
+    LOG(INFO) << "create aten::slice node: " << node_info(sliced_index_n);
+    sliced_index_n->copyMetadata(insertBefore);
+    auto sliced_index = sliced_index_n->insertBefore(insertBefore)->output();
+    return sliced_index;
+}
+
+//torch >= 1.12 changed the Source API; stay compatible with both versions.
+#if TORCH_VERSION_MAJOR > 1 || (TORCH_VERSION_MAJOR == 1 && TORCH_VERSION_MINOR >= 12)
+#define NODE_SOURCE_TEXT(name) \
+        name->text_str()
+#else
+#define NODE_SOURCE_TEXT(name) \
+        name->text()
+#endif
+
+/**
+* @brief find the slice ops associated with a copy_ or index_put_ op.
+* they come from the same line of python code, and together they implement
+* the slicing of a list or tensor, e.g. a statement like y = x[1:3, 0].
+// Example graph:
+//    %306 : Float(*, 16, 64, 16, 16) = aten::slice(%out.4, %0, %none, %none, %1)
+//    %307 : Float(*, 15, 64, 16, 16) = aten::slice(%306, %1, %none, %11, %1)
+//    %308 : Float(*, 15, 8, 16, 16) = aten::slice(%307, %2, %none, %y, %1)
+//    %309 : Tensor = aten::copy_(%308, %305, %false)
+* **/
+std::vector<Node*> FuseCopy::fetch_slice_and_select_pattern(const Node* node) {
+    TORCH_INTERNAL_ASSERT(node->kind() == aten::index_put ||
+                        node->kind() == aten::index_put_ ||
+                        node->kind() == aten::copy_);
+    const auto& node_source = node->sourceRange().source();
+
+    std::vector<Node*> slice_and_select_nodes;
+    auto src_node = node->input(0)->node();
+    while (src_node) {
+        auto& src_node_source = src_node->sourceRange().source();
+        if ((src_node->kind() == aten::slice || src_node->kind() == aten::select) &&
+                NODE_SOURCE_TEXT(node_source) == NODE_SOURCE_TEXT(src_node_source) &&
+                node_source->starting_line_no() == src_node_source->starting_line_no()) {
+            slice_and_select_nodes.emplace_back(src_node);
+            //slices usually come in a row
+            src_node = src_node->input(0)->node();
+        } else {
+            src_node = nullptr;
+        }
+    }
+    return slice_and_select_nodes;
+}
+
+/**
+* @brief merge the associated slice and select nodes into indices:
+* **/
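+// Illustrative walk-through (assumed example): for x[1:3, 0] on a rank-2
+// tensor, dim 0 maps to arange(x.size(0))[1:3] (from the aten::slice) and
+// dim 1 maps to tensor(0) (from the aten::select); once the select is
+// consumed, dim_offset becomes 1 and every later dim is shifted by it.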
+std::unordered_map<int64_t, ConvertedIndex> FuseCopy::merge_slice_and_select_to_indices(
+                                        Graph* graph,
+                                        Node* index_put_node,
+                                        const std::vector<Node*>& slice_and_select_nodes,
+                                        Value* orig_data) {
+
+    std::unordered_map<int64_t, ConvertedIndex> dim_index_map;
+    int64_t cur_dim = 0;
+    /* the meaning of dim_offset: when select and slice ops are mixed to slice
+       a tensor, aten::select squeezes one dim away, which shifts the dim info
+       of every op that follows it (select and slice alike). the dims of later
+       ops must therefore be corrected by the number of selects seen so far.
+    */
+    int64_t dim_offset = 0;
+    const auto orig_tensor_indices = index_put_node->input(1)->node()->inputs();
+    // slice_and_select_nodes was collected back to front, so iterate it in reverse.
+    for (auto it = slice_and_select_nodes.rbegin(); it != slice_and_select_nodes.rend(); ++it) {
+        Node* node = *it;
+        LOG(INFO) << "handle slice or select node info: " << node_info(node);
+        //int64_t dim = node->inputs().at(1)->node()->t(attr::value).item().toLong();
+        int64_t dim = toIValue(node->inputs().at(1)).value().toInt();
+        if (dim < 0) {
+            auto input_type = orig_data->type()->expect<TensorType>();
+            if (input_type->dim().has_value()) {
+                auto rank = static_cast<int64_t>(input_type->dim().value());
+                dim = dim + rank - dim_offset;
+            } else {
+                std::cerr << "Error: Poros handle index Ops - Cannot export ellipsis indexing for input "
+                        << "of unknown rank.";
+            }
+        }
+
+        dim = dim + dim_offset;
+        while (cur_dim < dim) {
+            if (cur_dim - dim_offset >= (int64_t)orig_tensor_indices.size() ||
+                    index_put_node->input(1)->node()->input(cur_dim - dim_offset)->node()->mustBeNone()) {
+                auto size = create_size_of_dim(orig_data, cur_dim, index_put_node);
+                WithInsertPoint guard(index_put_node);
+                //auto index_tensor = graph->insert(aten::arange, {size});
+                auto index_tensor = graph->insert(aten::arange, {size}, {NamedValue("dtype", c10::kLong)});
+                LOG(INFO) << "create aten::arange node: " << node_info(index_tensor->node());
+                dim_index_map.emplace(std::piecewise_construct, std::forward_as_tuple(cur_dim),
+                        std::forward_as_tuple(index_tensor, aten::slice));
+            } else if (cur_dim - dim_offset < (int64_t)orig_tensor_indices.size()) {
+                dim_index_map.emplace(std::piecewise_construct, std::forward_as_tuple(cur_dim),
+                        std::forward_as_tuple(orig_tensor_indices[cur_dim - dim_offset], aten::index));
+            }
+            cur_dim++;
+        }
+
+        AT_ASSERT(cur_dim == dim);
+        LOG(INFO) << "cur_dim info: " << cur_dim << ", dim_offset: " << dim_offset;
+
+        if (node->kind() == aten::slice) {
+            auto size = create_size_of_dim(orig_data, dim, index_put_node);
+            auto index_tensor = convert_slice_to_index(node, size, index_put_node);
+            dim_index_map.emplace(std::piecewise_construct, std::forward_as_tuple(dim),
+                    std::forward_as_tuple(index_tensor, aten::slice));
+        } else if (node->kind() == aten::select) {
+            auto index_tensor = convert_select_to_index(node->input(2), index_put_node);
+            dim_index_map.emplace(std::piecewise_construct, std::forward_as_tuple(dim),
+                    std::forward_as_tuple(index_tensor, aten::select));
+            dim_offset++;
+        } else {
+            AT_ERROR("Unexpected node kind ", node->kind().toDisplayString(), " Expected aten::slice or aten::select.");
+        }
+        cur_dim++;
+    }
+
+    while (cur_dim - dim_offset < (int64_t)orig_tensor_indices.size()) {
+        dim_index_map.emplace(std::piecewise_construct, std::forward_as_tuple(cur_dim),
+                std::forward_as_tuple(orig_tensor_indices[cur_dim - dim_offset], aten::index));
+        cur_dim++;
+    }
+    // Each dimension should have its associated index tensor.
+    AT_ASSERT((int64_t)dim_index_map.size() == cur_dim);
+    return dim_index_map;
+}
+
+std::vector<Value*> FuseCopy::reshape_to_advanced_indexing_format(Graph* graph, Node* index_put_node,
+        std::unordered_map<int64_t, ConvertedIndex>& dim_index_map) {
+    std::vector<Value*> indices;
+    size_t min_index_dim = dim_index_map.size();
+    size_t max_index_dim = 0;
+    size_t tensor_ind_count = 0;
+    for (size_t i = 0; i < dim_index_map.size(); ++i) {
+        auto index_i = dim_index_map.find(i);
+        AT_ASSERT(index_i != dim_index_map.end());
+        if (index_i->second.orig_node_kind == aten::index) {
+            if (i < min_index_dim)
+                min_index_dim = i;
+            if (i > max_index_dim)
+                max_index_dim = i;
+            tensor_ind_count++;
+        }
+    }
+
+    if (((max_index_dim - min_index_dim + 1) != tensor_ind_count) && tensor_ind_count != 0) {
+        AT_ERROR("Only consecutive 1-d tensor indices are supported in exporting aten::index_put to POROS.");
+    }
+
+    size_t tensor_ind_offset = tensor_ind_count == 0 ? 0 : tensor_ind_count - 1;
+    WithInsertPoint guard(index_put_node);
+    for (size_t i = 0; i < dim_index_map.size(); ++i) {
+        size_t ind_size = 0;
+        auto index_i = dim_index_map.find(i);
+        AT_ASSERT(index_i != dim_index_map.end());
+        Value* index = index_i->second.index;
+        switch (index_i->second.orig_node_kind) {
+            case aten::select:
+            case aten::slice: {
+                if (i < min_index_dim) {
+                    ind_size = dim_index_map.size() - tensor_ind_offset - i;
+                } else {
+                    ind_size = dim_index_map.size() - i;
+                }
+                break;
+            }
+            case aten::index: {
+                ind_size = dim_index_map.size() - tensor_ind_offset - min_index_dim;
+                break;
+            }
+            default:
+                AT_ERROR("Unexpected node kind ", index_i->second.orig_node_kind);
+        }
+
+        if (ind_size != 1) {
+            std::vector<int64_t> view_shape(ind_size, 1);
+            view_shape[0] = -1;
+            auto unsqueezed_index = graph->insert(aten::view, {index, view_shape});
+            LOG(INFO) << "create aten::view node: " << node_info(unsqueezed_index->node());
+            indices.emplace_back(unsqueezed_index);
+        } else {
+            indices.emplace_back(index);
+        }
+    }
+    return indices;
+}
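+// Illustrative sketch (assumed rank 3, every dim sliced): the three index
+// tensors are viewed to shapes (-1, 1, 1), (-1, 1) and (-1), so together they
+// broadcast to the full indexing grid that aten::index_put expects.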
+/**
+* @brief handle aten::index_put / aten::index_put_:
+* gather the associated slice and select nodes, extract the indices from them,
+* and rewrite the whole pattern into a fresh index_put.
+* **/
+bool FuseCopy::prepare_index_put(Node* index_put_node) {
+    LOG(INFO) << "prepare for index put node: " << node_info(index_put_node);
+    TORCH_INTERNAL_ASSERT(index_put_node->kind() == aten::index_put ||
+                        index_put_node->kind() == aten::index_put_);
+    //find the associated slice and select nodes
+    std::vector<Node*> slice_and_select_nodes = fetch_slice_and_select_pattern(index_put_node);
+    if (slice_and_select_nodes.size() == 0) {
+        return false;
+    }
+    LOG(INFO) << "slice_and_select_nodes_size: " << slice_and_select_nodes.size();
+    Node* last_node = slice_and_select_nodes.size() > 0 ? slice_and_select_nodes.back() : index_put_node;
+    //find the original value being sliced; in the example graph that is %out.4.
+    Value* orig_data = last_node->input(0);
+    //bypass the case where the index_put node and the mutated value live in different blocks.
+    if (orig_data->node()->owningBlock() != index_put_node->owningBlock()) {
+        LOG(INFO) << "orig data comes from different block, bypass this situation";
+        return false;
+    }
+
+    auto graph = index_put_node->owningGraph();
+    //adjust the value input.
+    adjust_value(graph, index_put_node, slice_and_select_nodes, orig_data);
+
+    //turn the slice and select ops into indices.
+    std::unordered_map<int64_t, ConvertedIndex> dim_index_map =
+            merge_slice_and_select_to_indices(graph, index_put_node, slice_and_select_nodes, orig_data);
+
+    std::vector<Value*> indices = reshape_to_advanced_indexing_format(graph, index_put_node, dim_index_map);
+
+    // Create new index_put node with converted indices.
+    const auto list_indices = graph->createList(OptionalType::ofTensor(), indices)
+            ->insertBefore(index_put_node)->output();
+    LOG(INFO) << "create tensorlist node: " << node_info(list_indices->node());
+    auto new_index_put_node = graph->create(aten::index_put,
+            {orig_data, list_indices, index_put_node->input(2), index_put_node->input(3)});
+    LOG(INFO) << "create aten::index_put node: " << node_info(new_index_put_node);
+    new_index_put_node->insertBefore(index_put_node);
+    new_index_put_node->copyMetadata(index_put_node);
+    auto new_index_put = new_index_put_node->output();
+    new_index_put->copyMetadata(index_put_node->output());
+    index_put_node->output()->replaceAllUsesWith(new_index_put);
+    orig_data->replaceAllUsesAfterNodeWith(index_put_node, new_index_put);
+    record_transform(index_put_node)->to(new_index_put_node);
+    index_put_node->destroy();
+    return true;
+}
+
+/**
+* @brief handle aten::copy_: replace it with index_put_.
+* the dummylist used in this step is only a placeholder and cannot really
+* drive an index_put; prepare_index_put finds the real index information.
+* **/
+
+// Example:
+//    %out: Tensor = aten::copy_(%self, %src, %non_blocking)
+//
+// After this prepare function:
+//    %dummylist : Tensor?[] = prim::ListConstruct()
+//    %newout: Tensor = aten::index_put_(%self, %dummylist, %src, %non_blocking)
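+// Illustrative end-to-end sketch (assumed example): a python statement like
+//     out[1:3] = src
+// is scripted into aten::slice + aten::copy_; prepare_copy rewrites the copy_
+// into aten::index_put_ with the placeholder index list, and prepare_index_put
+// then folds the slice into a real index tensor for a fresh aten::index_put.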
+bool FuseCopy::prepare_copy(Node* node) {
+    TORCH_INTERNAL_ASSERT(node->kind() == aten::copy_);
+    LOG(INFO) << "prepare for copy node: " << node_info(node);
+
+    //find the associated slice and select nodes
+    std::vector<Node*> slice_and_select_nodes = fetch_slice_and_select_pattern(node);
+    if (slice_and_select_nodes.size() == 0) {
+        return false;
+    }
+
+    //find the original value being sliced (in the example graph, %out.4) and
+    //resolve the reference semantics first.
+    Node* last_node = slice_and_select_nodes.back();
+    Value* orig_data = last_node->input(0);
+    //bypass the case where the copy_ node and the mutated value live in different blocks.
+    if (orig_data->node()->owningBlock() != node->owningBlock()) {
+        LOG(INFO) << "orig data comes from different block, bypass this situation";
+        return false;
+    }
+    orig_data->replaceAllUsesAfterNodeWith(node, node->output());
+
+    //do the index_put_ substitution
+    WithInsertPoint guard(node);
+    auto graph = node->owningGraph();
+    Value* dummy_list = graph->insertNode(graph->createList(OptionalType::ofTensor(), {}))->output();
+
+    // when the size of value differs from the size of self, the two have to be aligned.
+    // using expand_as here directly fails the unit tests, because index_put also
+    // supports a rank-0 value; the index_put converter has to handle that case instead.
+    // Value* expanded_value = graph->insert(aten::expand_as,
+    //         {node->input(1), orig_data});
+    // expanded_value->node()->setSourceRange(node->sourceRange());
+    // expanded_value->copyMetadata(node->input(1));
+    // expanded_value->node()->copyMetadata(node);
+
+    Value* index_put = graph->insert(aten::index_put_,
+            {node->input(0), dummy_list, node->input(1), node->input(2)});
+    index_put->node()->copyMetadata(node);
+    index_put->copyMetadata(node->output());
+    node->output()->replaceAllUsesWith(index_put);
+
+    record_transform(node)->to(index_put->node());
+    bool changed = prepare_index_put(index_put->node());
+    if (changed == true) {
+        node->destroy();
+    }
+    return changed;
+}
+
+/**
+ * search for the aten::copy_ or aten::index_put pattern to fuse
+ * @param block
+ * @return true if fuse success
+ */
+bool FuseCopy::try_to_fuse_copy(torch::jit::Block *block) {
+    bool graph_changed = false;
+    for (auto it = block->nodes().begin(); it != block->nodes().end();) {
+        Node* node = *it;
+        it++;  // node n can be destroyed
+
+        auto nkind = node->kind();
+        //sub_block situation
+        if (nkind == prim::If || nkind == prim::Loop) {
+            for (Block* sub_block : node->blocks()) {
+                try_to_fuse_copy(sub_block);
+            }
+        } else {
+            if (nkind == aten::copy_) {
+                LOG(INFO) << "copy situation meet";
+                graph_changed |= prepare_copy(node);
+            } else if (nkind == aten::index_put || nkind == aten::index_put_) {
+                LOG(INFO) << "index_put or index_put_ situation meet";
+                graph_changed |= prepare_index_put(node);
+            }
+        }
+    }
+    return graph_changed;
+}
+
+REGISTER_OP_FUSER(FuseCopy)
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_copy.h b/poros/src/poros/lowering/fuse_copy.h
new file mode 100644
index 0000000000..ee4617196a
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_copy.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file: fuse_copy.h
+* @author: tianjinjin@baidu.com
+* @date: Mon Aug 22 11:33:45 CST 2022
+* @brief:
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/***
+ * torchscript has an important concept called view:
+ * when slice ops appear (including several in a row), no real memory copy
+ * happens; the buffer is reused through views as far as possible,
+ * and only when a copy_ appears does a real buffer copy take place.
+ * in the graph below, for example, what has really changed by the end is %out.4.
+ * typical graph for copy_
+
+    %none : NoneType = prim::Constant()
+    %false : bool = prim::Constant[value=0]()
+    %0 : int = prim::Constant[value=0]()
+    %1 : int = prim::Constant[value=1]()
+    %2 : int = prim::Constant[value=2]()
+    %11 : int = prim::Constant[value=-1]()
+    %out.4 : Float(*, 16, 64, 16, 16) = aten::zeros_like(%x, %none, %none, %none, %none, %none)
+    %303 : Float(*, 16, 64, 16, 16) = aten::slice(%x, %0, %none, %none, %1)
+    %304 : Float(*, 15, 64, 16, 16) = aten::slice(%303, %1, %1, %none, %1)
+    %305 : Float(*, 15, 8, 16, 16) = aten::slice(%304, %2, %none, %y, %1)
+
+    %306 : Float(*, 16, 64, 16, 16) = aten::slice(%out.4, %0, %none, %none, %1)
+    %307 : Float(*, 15, 64, 16, 16) = aten::slice(%306, %1, %none, %11, %1)
+    %308 : Float(*, 15, 8, 16, 16) = aten::slice(%307, %2, %none, %y, %1)
+    %309 : Tensor = aten::copy_(%308, %305, %false)
+
+    %310 : Float(*, 15, 64, 16, 16) = aten::slice(%303, %1, %none, %11, %1)
+    %311 : int = aten::mul(%2, %y)
+    %312 : Float(*, 15, 8, 16, 16) = aten::slice(%310, %2, %y, %311, %1)
+    %313 : Float(*, 16, 64, 16, 16) = aten::slice(%out.4, %0, %none, %none, %1)
+    %314 : Float(*, 15, 64, 16, 16) = aten::slice(%313, %1, %1, %none, %1)
+    %315 : Float(*, 15, 8, 16, 16) = aten::slice(%314, %2, %y, %311, %1)
+    %316 : Tensor = aten::copy_(%315, %312, %false)
+
+    %317 : Float(*, 16, 64, 16, 16) = aten::slice(%303, %1, %none, %none, %1)
+    %318 : Float(*, 16, 48, 16, 16) = aten::slice(%317, %2, %311, %none, %1)
+    %319 : Float(*, 16, 64, 16, 16) = aten::slice(%out.4, %0, %none, %none, %1)
+    %320 : Float(*, 16, 64, 16, 16) = aten::slice(%319, %1, %none, %none, %1)
+    %321 : Float(*, 16, 48, 16, 16) = aten::slice(%320, %2, %311, %none, %1)
+    %322 : Tensor = aten::copy_(%321, %318, %false)
+
+    %323 : int[] = prim::ListConstruct(%nt.3, %c.3, %h.3, %w.3)
+    %final : Float(*, 64, 16, 16) = aten::view(%out.4, %323)
+ *
+ * the implementation of index_put:
+ * aten/src/ATen/native/cuda/indexing.cu
+ * https://github.com/pytorch/pytorch/blob/v1.9.0-rc1/aten/src/ATen/native/cuda/Indexing.cu#L209
+ * ***/
+
+struct ConvertedIndex {
+    ConvertedIndex(torch::jit::Value* index, c10::Symbol orig_node_kind)
+        : index(index), orig_node_kind(orig_node_kind) {}
+
+    torch::jit::Value* index = nullptr;
+    c10::Symbol orig_node_kind;
+};
+
+/**
+ * scenarios currently handled:
+ * 1. pure slice:                                   out[:, :-1, :3] = x[:, 1:, :3]
+ * 2. slice + one select (RHS is a scalar):         out[2:3:1, :, :, 0, :] = 1
+ * 3. slice + several selects (RHS is a scalar):    out[2:3:1, 3, :, 0, :] = 1
+ * 4. slice + one select (RHS is a tensor):         boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
+ * 5. slice + several selects (RHS is a tensor):    boxes[:, 0, :, 1] = torch.clamp(boxes[:, 0, :, 1], min=0)
+ * **/
+class FuseCopy : public IFuser {
+public:
+    FuseCopy();
+
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    bool try_to_fuse_copy(torch::jit::Block *block);
+
+    bool prepare_copy(torch::jit::Node* node);
+    bool prepare_index_put(torch::jit::Node* index_put_node);
+
+    torch::jit::Value* create_size_of_dim(torch::jit::Value* input,
+                                        int64_t dim,
+                                        torch::jit::Node* insertBefore);
+    torch::jit::Value* convert_select_to_index(torch::jit::Value* index,
+                                        torch::jit::Node* insertBefore);
+    torch::jit::Value* convert_slice_to_index(torch::jit::Node* slice,
+                                        torch::jit::Value* size,
+                                        torch::jit::Node* insertBefore);
+
+    std::vector<torch::jit::Node*> fetch_slice_and_select_pattern(const torch::jit::Node* node);
+
+    std::unordered_map<int64_t, ConvertedIndex> merge_slice_and_select_to_indices(
+                                        torch::jit::Graph* graph,
+                                        torch::jit::Node* index_put_node,
+                                        const std::vector<torch::jit::Node*>& slice_and_select_nodes,
+                                        torch::jit::Value* orig_data);
+
+    std::vector<torch::jit::Value*> reshape_to_advanced_indexing_format(
+                                        torch::jit::Graph* graph,
+                                        torch::jit::Node* index_put_node,
+                                        std::unordered_map<int64_t, ConvertedIndex>& dim_index_map);
+
+    void adjust_value(torch::jit::Graph* graph,
+                    torch::jit::Node* index_put_node,
+                    const std::vector<torch::jit::Node*>& slice_and_select_nodes,
+                    torch::jit::Value* orig_data);
+
+    std::shared_ptr<torch::jit::Graph> graph_;
+};
+
+}
+}
+}
diff --git a/poros/src/poros/lowering/fuse_gelu.cpp b/poros/src/poros/lowering/fuse_gelu.cpp
new file mode 100644
index 0000000000..286bdec85f
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_gelu.cpp
@@ -0,0 +1,128 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_gelu.cpp
+* @author tianshaoqing@baidu.com
+* @date 2022-10-20 14:39:32
+* @brief
+**/
+
+#include "poros/lowering/fuse_gelu.h"
+
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+#include <torch/version.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+/**
+ * Rewrite aten::gelu as the fast tanh approximation:
+ * y = 0.5 * x * (1 + tanh(sqrt(2 / Pi) * (x + 0.044715 * x^3)))
+ * Note: this may introduce a small numerical diff.
+ * @param graph
+ * @return
+ */
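+// The constants in the replacement IR below are the literal values of this
+// approximation: sqrt(2 / pi) ≈ 0.7978845608 and the cubic coefficient
+// 0.044715, i.e. y = 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3))).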
+bool FuseGelu::fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    if (try_to_find_gelu(graph->block())) {
+        std::string gelu_pattern;
+        std::string gelu_reduce_pattern;
+
+        if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 12) {
+            gelu_pattern = R"IR(
+                graph(%x):
+                    %out : Tensor = aten::gelu(%x)
+                    return (%out))IR";
+
+            gelu_reduce_pattern = R"IR(
+                graph(%x.1 : Tensor):
+                    %1 : float = prim::Constant[value=0.044714999999999998]()
+                    %2 : float = prim::Constant[value=0.79788456080000003]()
+                    %3 : int = prim::Constant[value=3]()
+                    %4 : float = prim::Constant[value=1.0]()
+                    %5 : float = prim::Constant[value=0.5]()
+                    %6 : Tensor = aten::pow(%x.1, %3)
+                    %7 : Tensor = aten::mul(%6, %1)
+                    %8 : Tensor = aten::add(%7, %x.1, %4)
+                    %9 : Tensor = aten::mul(%8, %2)
+                    %10 : Tensor = aten::tanh(%9)
+                    %11 : Tensor = aten::add(%10, %4, %4)
+                    %12 : Tensor = aten::mul(%11, %x.1)
+                    %13 : Tensor = aten::mul(%12, %5)
+                    return (%13))IR";
+        } else {
+            gelu_pattern = R"IR(
+                graph(%x : Tensor, %approximate : str):
+                    %out : Tensor = aten::gelu(%x, %approximate)
+                    return (%out))IR";
+
+            gelu_reduce_pattern = R"IR(
+                graph(%x.1 : Tensor, %approximate):
+                    %1 : float = prim::Constant[value=0.044714999999999998]()
+                    %2 : float = prim::Constant[value=0.79788456080000003]()
+                    %3 : int = prim::Constant[value=3]()
+                    %4 : float = prim::Constant[value=1.0]()
+                    %5 : float = prim::Constant[value=0.5]()
+                    %6 : Tensor = aten::pow(%x.1, %3)
+                    %7 : Tensor = aten::mul(%6, %1)
+                    %8 : Tensor = aten::add(%7, %x.1, %4)
+                    %9 : Tensor = aten::mul(%8, %2)
+                    %10 : Tensor = aten::tanh(%9)
+                    %11 : Tensor = aten::add(%10, %4, %4)
+                    %12 : Tensor = aten::mul(%11, %x.1)
+                    %13 : Tensor = aten::mul(%12, %5)
+                    return (%13))IR";
+        }
+        torch::jit::SubgraphRewriter gelu_rewriter;
+        gelu_rewriter.RegisterRewritePattern(gelu_pattern, gelu_reduce_pattern);
+        gelu_rewriter.runOnGraph(graph);
+
+        return true;
+    }
+    return false;
+}
+
+FuseGelu::FuseGelu() = default;
+
+/**
+ * find out whether gelu exists.
+ * @param block
+ * @return bool: true if aten::gelu exists, else false.
+ */
+bool FuseGelu::try_to_find_gelu(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it;  //++it first, node may be destroyed later.
+        for (auto sub_block : node->blocks()) {
+            if (try_to_find_gelu(sub_block)) {
+                graph_changed = true;
+            }
+        }
+
+        if (node->kind() == torch::jit::aten::gelu) {
+            record_transform(torch::jit::aten::gelu)->to(torch::jit::aten::pow, torch::jit::aten::mul,
+                    torch::jit::aten::add, torch::jit::aten::tanh);
+            graph_changed = true;
+        }
+    }
+    return graph_changed;
+}
+
+REGISTER_OP_FUSER(FuseGelu)
+
+}
+}
+}// namespace
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_gelu.h b/poros/src/poros/lowering/fuse_gelu.h
new file mode 100644
index 0000000000..f0fe5bf047
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_gelu.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_gelu.h
+* @author tianshaoqing@baidu.com
+* @date 2022-10-20 14:39:32
+* @brief
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class FuseGelu : public IFuser {
+public:
+    FuseGelu();
+
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    bool try_to_find_gelu(torch::jit::Block *block);
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_hard_swish.cpp b/poros/src/poros/lowering/fuse_hard_swish.cpp
new file mode 100644
index 0000000000..e4832a759f
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_hard_swish.cpp
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_hard_swish.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-07 15:30:35
+* @brief
+**/
+#include "poros/lowering/fuse_hard_swish.h"
+
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * FuseHardSwish
+ * @param graph
+ * @return true if graph changed, false if not
+ */
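+// hardswish(x) = x * relu6(x + 3) / 6; the replacement pattern below spells
+// this out as aten::add(x, 3, 1), aten::clamp(., 0, 6), aten::mul(x, .) and
+// aten::div(., 6), which downstream converters handle as plain elementwise ops.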
+bool FuseHardSwish::fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    if (try_to_find_hardswish(graph->block())) {
+        std::string new_pattern = R"IR(
+            graph(%x):
+                %1 : int = prim::Constant[value=1]()
+                %3 : int = prim::Constant[value=3]()
+                %6 : int = prim::Constant[value=6]()
+                %7 : int = prim::Constant[value=0]()
+                %x_1 : Tensor = aten::add(%x, %3, %1)
+                %x_2 : Tensor = aten::clamp(%x_1, %7, %6)
+                %x_3 : Tensor = aten::mul(%x, %x_2)
+                %out : Tensor = aten::div(%x_3, %6)
+                return (%out))IR";
+
+        std::string old_pattern = R"IR(
+            graph(%x):
+                %out: Tensor = aten::hardswish(%x)
+                return (%out))IR";
+
+        torch::jit::SubgraphRewriter std_rewriter;
+        std_rewriter.RegisterRewritePattern(old_pattern, new_pattern);
+        std_rewriter.runOnGraph(graph);
+
+        return true;
+    }
+
+    return false;
+}
+
+/**
+ * search for hardswish activations recursively and record all findings
+ * @param block
+ * @return true if at least one hardswish was found, false otherwise
+ */
+bool FuseHardSwish::try_to_find_hardswish(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it;  //++it first, node may be destroyed later.
+        for (auto sub_block : node->blocks()) {
+            if (try_to_find_hardswish(sub_block)) {
+                graph_changed = true;
+            }
+        }
+        //only handle the aten::hardswish case
+        if (node->kind() != torch::jit::aten::hardswish) {
+            continue;
+        }
+        record_transform(torch::jit::aten::hardswish)->to(torch::jit::aten::add, torch::jit::aten::clamp, torch::jit::aten::div);
+
+        graph_changed = true;
+    }
+    return graph_changed;
+}
+
+FuseHardSwish::FuseHardSwish() = default;
+
+REGISTER_OP_FUSER(FuseHardSwish)
+
+}
+}
+}// namespace
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_hard_swish.h b/poros/src/poros/lowering/fuse_hard_swish.h
new file mode 100644
index 0000000000..3054b02bdf
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_hard_swish.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_hard_swish.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-07 15:31:26
+* @brief
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class FuseHardSwish : public IFuser {
+public:
+    FuseHardSwish();
+
+    /**
+     * FuseHardSwish
+     * @param graph
+     * @return true if graph changed, false if not
+     */
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+private:
+    /**
+     * search for hardswish activations recursively and record all findings
+     * @param block
+     * @return true if at least one hardswish was found, false otherwise
+     */
+    bool try_to_find_hardswish(torch::jit::Block *block);
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/poros/src/poros/lowering/fuse_meshgrid.cpp b/poros/src/poros/lowering/fuse_meshgrid.cpp
new file mode 100644
index 0000000000..ef131fbbcd
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_meshgrid.cpp
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_meshgrid.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-29 14:56:48
+* @brief
+**/
+
+#include "poros/lowering/fuse_meshgrid.h"
+
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+/**
+ * rewrite meshgrid with `ones + transpose + mul`
+ * @param graph
+ * @return
+ */
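+// Illustrative shapes (assumed 1-d inputs, x of length n and y of length m):
+//     grid_x = ones(1, m, dtype=x.dtype) * x.unsqueeze(0).transpose(0, 1)  # (n, m)
+//     grid_y = ones(n, 1, dtype=y.dtype) * y.unsqueeze(0)                  # (n, m)
+// which reproduces torch.meshgrid([x, y]) without a dedicated converter.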
+bool FuseMeshgrid::fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    if (try_to_find_meshgrid(graph->block())) {
+        std::string old_pattern = R"IR(
+            graph(%x_1 : Tensor, %y_1 : Tensor ):
+                %1 : Tensor[] = prim::ListConstruct(%x_1, %y_1)
+                %2 : Tensor[] = aten::meshgrid(%1)
+                return (%2))IR";
+
+        std::string new_pattern = R"IR(
+            graph(%x_1 : Tensor, %y_1 : Tensor):
+                %device.1 : Device = prim::device(%x_1)
+                %2 : NoneType = prim::Constant()
+                %3 : int = prim::Constant[value=1]()
+                %4 : int = prim::Constant[value=0]()
+                %5 : int[] = aten::size(%y_1)
+                %6 : int = aten::__getitem__(%5, %4)
+                %7 : int[] = prim::ListConstruct(%3, %6)
+                %x_dtype : int = prim::dtype(%x_1)
+                %8 : Tensor = aten::ones(%7, %x_dtype, %2, %device.1, %2)
+                %10 : Tensor = aten::unsqueeze(%x_1, %4)
+                %11 : Tensor = aten::transpose(%10, %4, %3)
+                %12 : Tensor = aten::mul(%8, %11)
+
+                %25 : int[] = aten::size(%x_1)
+                %26 : int = aten::__getitem__(%25, %4)
+                %27 : int[] = prim::ListConstruct(%26, %3)
+                %y_dtype : int = prim::dtype(%y_1)
+                %28 : Tensor = aten::ones(%27, %y_dtype, %2, %device.1, %2)
+                %29 : Tensor = aten::unsqueeze(%y_1, %4)
+                %18 : Tensor = aten::mul(%28, %29)
+
+                %19 : Tensor[] = prim::ListConstruct(%12, %18)
+                return (%19))IR";
+        torch::jit::SubgraphRewriter std_rewriter;
+        std_rewriter.RegisterRewritePattern(old_pattern, new_pattern);
+        std_rewriter.runOnGraph(graph);
+
+        return true;
+    }
+    return false;
+}
+
+FuseMeshgrid::FuseMeshgrid() = default;
+
+/**
+ * find out whether meshgrid exists
+ * @param block
+ * @return bool: true if meshgrid exists, else false
+ */
+bool FuseMeshgrid::try_to_find_meshgrid(torch::jit::Block *block) {
+    bool graph_changed = false;
+    auto it = block->nodes().begin();
+    while (it != block->nodes().end()) {
+        auto node = *it;
+        ++it;  //++it first, node may be destroyed later.
+        for (auto sub_block : node->blocks()) {
+            if (try_to_find_meshgrid(sub_block)) {
+                graph_changed = true;
+            }
+        }
+        //only handle the aten::meshgrid case
+        if (node->kind() != torch::jit::aten::meshgrid) {
+            continue;
+        }
+
+        record_transform(torch::jit::aten::meshgrid)->to(torch::jit::aten::ones, torch::jit::aten::unsqueeze,
+                torch::jit::aten::transpose, torch::jit::aten::mul);
+        graph_changed = true;
+    }
+    return graph_changed;
+}
+
+REGISTER_OP_FUSER(FuseMeshgrid)
+
+}
+}
+}// namespace
diff --git a/poros/src/poros/lowering/fuse_meshgrid.h b/poros/src/poros/lowering/fuse_meshgrid.h
new file mode 100644
index 0000000000..d18dfd25d5
--- /dev/null
+++ b/poros/src/poros/lowering/fuse_meshgrid.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_meshgrid.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-29 14:56:57
+* @brief
+**/
+
+#pragma once
+
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+class FuseMeshgrid : public IFuser {
+public:
+    FuseMeshgrid();
+
+    bool fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    bool try_to_find_meshgrid(torch::jit::Block *block);
+
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/poros/src/poros/lowering/input_param_propagate.cpp b/poros/src/poros/lowering/input_param_propagate.cpp
new file mode 100644
index 0000000000..dda36c0170
--- /dev/null
+++ b/poros/src/poros/lowering/input_param_propagate.cpp
@@ -0,0 +1,94 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file input_param_propagate.cpp
+* @author huangben@baidu.com
+* @date 2021-08-18 14:56:57
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+struct InputParamPropagate {
+    InputParamPropagate(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run(std::vector<Stack>& stack_vec) {
+        if (stack_vec.size() == 0) {
+            return;
+        }
+
+        auto check_input_param_unchanged = [](std::vector<Stack>& stack_vec, size_t offset) {
+            if (stack_vec.size() == 1) {
+                return true;
+            }
+            auto ivalue = stack_vec[0][offset];
+            for (size_t idx = 1; idx < stack_vec.size(); ++idx) {
+                if (stack_vec[idx][offset] != ivalue) {
+                    return false;
+                }
+            }
+            return true;
+        };
+
+        auto g_inputs = graph_->inputs();
+        size_t extra_offset = 0;
+        for (size_t offset = 0; offset < stack_vec[0].size(); ++offset) {
+            if (stack_vec[0][offset].isBool() || stack_vec[0][offset].isInt()) {
+                if (check_input_param_unchanged(stack_vec, offset)) {
+                    WithInsertPoint guard(graph_->block()->nodes().front());
+                    auto insert_value = graph_->insertConstant(stack_vec[0][offset]);
+                    if (g_inputs.size() == stack_vec[0].size()) {
+                        g_inputs[offset]->replaceAllUsesWith(insert_value);
+                    } else {
+                        //TODO: this type check is not comprehensive. It may lead to bugs with unexpected input data.
+                        while (c10::ClassTypePtr c = g_inputs[offset + extra_offset]->type()->cast<c10::ClassType>()) {
+                            if (c->is_module()) {
+                                extra_offset++;
+                            } else {
+                                break;  //non-module class type: stop scanning to avoid looping forever.
+                            }
+                        }
+                        g_inputs[offset + extra_offset]->replaceAllUsesWith(insert_value);
+                    }
+                }
+            }
+        }
+        return;
+    }
+
+private:
+    std::shared_ptr<Graph> graph_;
+};
+} // namespace
+
+void input_param_propagate(std::shared_ptr<torch::jit::Graph> graph,
+        std::vector<std::vector<c10::IValue>>& stack_vec) {
+    InputParamPropagate ipp(std::move(graph));
+    ipp.run(stack_vec);
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/link_mutable_list_pass.cpp b/poros/src/poros/lowering/link_mutable_list_pass.cpp
new file mode 100644
index 0000000000..b7eed72f5b
--- /dev/null
+++ b/poros/src/poros/lowering/link_mutable_list_pass.cpp
@@ -0,0 +1,204 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file link_mutable_list_pass.cpp
+* @author tianshaoqing@baidu.com
+* @date Thu May 9 11:15:49 CST 2022
+* @brief
+**/
+
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/context/poros_global.h"
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct LinkMutableList {
+    LinkMutableList(std::shared_ptr<Graph> graph) : graph_(std::move(graph)),
+        _mutable_list_ops_set(PorosGlobalContext::instance().supported_mutable_ops_set) {}
+
+    void run() {
+        GRAPH_DUMP("Before linking mutable list Graph: ", graph_);
+        bool changed = handle_mutable_list(graph_->block());
+        if (changed) {
+            ConstantPropagation(graph_);
+            EliminateDeadCode(graph_);
+            EliminateCommonSubexpression(graph_);
+            ConstantPooling(graph_);
+        }
+        GRAPH_DUMP("After linking mutable list Graph: ", graph_);
+        return;
+    }
+
+private:
+    std::shared_ptr<Graph> graph_;
+
+    const std::set<c10::Symbol> _mutable_list_ops_set;
+    // handle_mutable_list: chain the inputs and outputs of mutable list ops.
+
+    // usual case (no sub-blocks):
+    // ---------------------------------------------
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%list, %x2)
+    // %l3 : Tensor[] = aten::append(%list, %x3)
+    // %l4 : Tensor[] = aten::append(%list, %x4)
+    // ---------------------------------------------
+    // is rewritten into the following IR:
+    // ---------------------------------------------
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%l1, %x2)
+    // %l3 : Tensor[] = aten::append(%l2, %x3)
+    // %l4 : Tensor[] = aten::append(%l3, %x4)
+    // ---------------------------------------------
+    // special case (with sub-blocks), take the IR below as an example:
+    // ----------------------------------------------
+    // %list : Tensor[] = prim::ListConstruct(...)
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%list, %x2)
+    //  = prim::Loop(%5, %2)
+    //   block0(%i : int):
+    //     %l3 : Tensor[] = aten::_set_item(%list, %i, %x3)
+    //     %l4 : Tensor[] = aten::append(%list, %x4)
+    //     -> (%2)
+    // %l5 : Tensor[] = aten::append(%list, %x5)
+    // %%list2 : Tensor[] = aten::slice(%list, %4, %3, %4)
+    // -----------------------------------------------
+    // only the mutable list ops of the outermost main graph are chained; the
+    // ones inside sub-blocks are left alone. the IR becomes:
+    // -----------------------------------------------
+    // %list : Tensor[] = prim::ListConstruct(...)
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%l1, %x2)
+    //  = prim::Loop(%5, %2)
+    //   block0(%i : int):
+    //     %l3 : Tensor[] = aten::_set_item(%l2, %i, %x3)
+    //     %l4 : Tensor[] = aten::append(%l2, %x4)
+    //     -> (%2)
+    // %l5 : Tensor[] = aten::append(%l2, %x5)
+    // %%list2 : Tensor[] = aten::slice(%l5, %4, %3, %4)
+    // ----------------------------------------------
+    // it is enough to keep the mutable list ops inside sub-blocks out of the
+    // fused subgraphs; mutations made by subgraphs of the main graph do not
+    // have to be passed back.
+    bool handle_mutable_list(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end(); ) {
+            Node* node = *it;
+            ++it;
+            if (_mutable_list_ops_set.find(node->kind()) != _mutable_list_ops_set.end()) {
+                if (node->outputs().size() == 1) {
+                    changed = true;
+                    node->input(0)->replaceAllUsesAfterNodeWith(node, node->output(0));
+                } else {
+                    LOG(WARNING) << "mutable op: " << node_info(node) << " output size() != 1. " <<
+                            "This situation is not yet supported.";
+                }
+            }
+        }
+        return changed;
+    }
+    // earlier implementations, kept for reference:
+    // version one, which should have been the ideal one, but never ran through
+    // ----------------------------------------------
+    // %list : Tensor[] = prim::ListConstruct(...)
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%l1, %x2)
+    //  = prim::Loop(%5, %2)
+    //   block0(%i : int):
+    //     %l3 : Tensor[] = aten::_set_item(%l2, %i, %x3)   <------ fails at this step
+    //     %l4 : Tensor[] = aten::append(%l2, %x4)
+    //     -> (%2)
+    // %l5 : Tensor[] = aten::append(%l2, %x5)
+    // %%list2 : Tensor[] = aten::slice(%l2, %4, %3, %4)
+    // -------------------------------------------------
+    // *calling replaceAllUsesAfterNodeWith on a value inside a sub-block makes
+    // jit throw when the value also has users outside that block. here %l2 has
+    // users outside the sub-block, so replacing %l2 with %l3 beyond the
+    // sub-block fails.
+    /*
+    bool handle_mutable_list(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end(); ) {
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= handle_mutable_list(subblock);
+            }
+            if (_mutable_list_ops_set.find(node->kind()) != _mutable_list_ops_set.end()) {
+                changed = true;
+                node->input(0)->replaceAllUsesAfterNodeWith(node, node->output(0));
+            }
+        }
+        return changed;
+    }*/
+    // version two: only replace users that live in the same block and come
+    // after this node (mutable). after running:
+    // ----------------------------------------------
+    // %list : Tensor[] = prim::ListConstruct(...)
+    // %l1 : Tensor[] = aten::append(%list, %x1)
+    // %l2 : Tensor[] = aten::append(%l1, %x2)
+    //  = prim::Loop(%5, %2)
+    //   block0(%i : int):
+    //     %l3 : Tensor[] = aten::_set_item(%list, %i, %x3)
+    //     %l4 : Tensor[] = aten::append(%l3, %x4)
+    //     -> (%2)
+    // %l5 : Tensor[] = aten::append(%l2, %x5)
+    // %%list2 : Tensor[] = aten::slice(%l5, %4, %3, %4)
+    // ------------------------------------------------
+    // the scope is limited to the block owning the node, so the sub-block
+    // above still refers to the initial mutable value (%list); mutations of
+    // subgraphs in the main graph and in sub-blocks would both have to be
+    // passed back, and this version depends on that pass-back.
+    /*
+    bool handle_mutable_list(Block* block) {
+        bool changed = false;
+        for (auto it = block->nodes().begin(); it != block->nodes().end(); ) {
+            Node* node = *it;
+            ++it;
+            for (Block* subblock : node->blocks()) {
+                changed |= handle_mutable_list(subblock);
+            }
+            if (_mutable_list_ops_set.find(node->kind()) != _mutable_list_ops_set.end()) {
+                changed = true;
+                torch::jit::use_list use_list = node->input(0)->uses();
+                for (size_t u = 0; u < use_list.size(); u++) {
+                    // only replace users in the same block that come after this node
+                    if (use_list[u].user->owningBlock() == block && use_list[u].user->isAfter(node)) {
+                        use_list[u].user->replaceInput(use_list[u].offset, node->output(0));
+                    }
+                }
+            }
+        }
+        return changed;
+    }*/
+};
+
+} // namespace
+
+void link_mutable_list(std::shared_ptr<torch::jit::Graph> graph) {
+    LinkMutableList lml(std::move(graph));
+    lml.run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/lowering_pass.h b/poros/src/poros/lowering/lowering_pass.h
new file mode 100644
index 0000000000..e7b8721f6e
--- /dev/null
+++ b/poros/src/poros/lowering/lowering_pass.h
@@ -0,0 +1,190 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file lowering_pass.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:18
+* @brief
+**/
+#pragma once
+
+#include <memory>
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <ATen/core/ivalue.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+* @brief remove pure prim::RaiseException branches from the graph.
+* (otherwise the graph gets segmented into far too many blocks)
+**/
+void eliminate_exception_pass(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief cancel out prim::ListConstruct nodes that are immediately followed by prim::ListUnpack.
+* prim::ListConstruct packs several elements into a list,
+* prim::ListUnpack unpacks a list into several elements.
+* when both handle the same list and no node in between may mutate that list, the pair cancels out.
+**/
+void eliminate_some_list(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief cancel out prim::DictConstruct nodes whose output is consumed only by
+* aten::__getitem__ (and whose dict keys are constants).
+* prim::DictConstruct packs several elements into a dict,
+* aten::__getitem__ fetches an element from a list or dict.
+* when the constructed dict is only read by aten::__getitem__ and nothing may
+* mutate it, both kinds of ops cancel out.
+**/
+void eliminate_some_dict(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief remove aten::copy_ nodes whose result is never used.
+*
+**/
+void eliminate_useless_copy(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+ * @brief try to replace maxpool_with_indices with plain maxpool.
+ * take max_pool2d as an example:
+ * the schema of max_pool2d_with_indices is: aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+ * while the schema of max_pool2d is: aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+ * the two ops take exactly the same inputs. max_pool2d_with_indices has two outputs:
+ * the first is identical to the output of max_pool2d, the second carries the indices.
+ * when the second output (indices) of max_pool2d_with_indices is not consumed by any
+ * later op, we replace max_pool2d_with_indices with max_pool2d directly.
+ **/
+void eliminate_maxpool_with_indices(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief unroll loops that meet our conditions, to avoid excessive blocks
+* hurting the subgraph segmentation logic.
+* this function borrows heavily from jit's native UnrollLoop implementation;
+* the body-size and loop-count limits of the native pass do not match what
+* poros expects and the native pass exposes no interface to tune them, so it
+* is reimplemented here with adjusted unrolling conditions and details.
+**/
+void unrolling_loop(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief replace aten::std with aten::var + aten::sqrt.
+* rationale: the standard deviation (aten::std) is the square root (aten::sqrt)
+* of the variance (aten::var).
+**/
+void unpack_std(std::shared_ptr<torch::jit::Graph>& graph);
+
+/**
+* @brief replace aten::var with aten::mul + aten::mean and friends.
+* follows the pytorch-1.9.0 implementation of the op:
+* https://github.com/pytorch/pytorch/blob/v1.9.0/aten/src/ATen/native/ReduceOps.cpp#L1380
+**/
+void unpack_var(std::shared_ptr<torch::jit::Graph>& graph);
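+// Note: the decomposition above relies on the standard identity
+//     var(x) = mean(x * x) - mean(x)^2,
+// additionally scaled by N / (N - 1) for the unbiased (default) estimator.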
+
+/**
+* @brief try to turn the result of aten::percentFormat into a constant.
+* background: aten::percentFormat mainly assembles strings, and is often
+* paired with prim::If string comparisons to select a branch.
+* when all inputs of percentFormat are constants, compute the result directly
+* and substitute a constant; together with constant prim::If conditions this
+* ultimately removes unnecessary branches.
+**/
+void freeze_percentformat(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief try to freeze the result of aten::size, judging by how its output is used later.
+* note: this function must run after the warm-up data has been pushed through the
+* whole graph, and it depends on how well the warm-up data covers the input space.
+**/
+void freeze_aten_size(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief try to freeze the result of aten::len, judging by how its output is used later.
+* note: must run after warm-up, and depends on the coverage of the warm-up data.
+**/
+void freeze_aten_len(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief try to freeze the result of aten::dim, judging by how its output is used later.
+* note: must run after warm-up, and depends on the coverage of the warm-up data.
+**/
+void freeze_aten_dim(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief when a ListConstruct merely appends a constant, its output can be
+* replaced with a constant directly,
+* e.g. float becomes (float, float, ...)
+**/
+void freeze_list_construct(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief try to prune simple-typed graph inputs (bool or int).
+* if such an input stays unchanged across all warm-up rounds, it is considered
+* replaceable by a constant.
+* note: must run after warm-up, and depends on the coverage of the warm-up data.
+**/
+void input_param_propagate(std::shared_ptr<torch::jit::Graph> graph,
+                        std::vector<std::vector<c10::IValue>>& stack_vec);
+
+/**
+* @brief remove the prim::profile nodes that track bool-typed values.
+* these prim::profile nodes are added to the graph during data warm-up
+* (IvalueAnalysis) and must be removed once warm-up is finished.
+**/
+void remove_simple_type_profile_nodes(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief replace log_softmax() with log(softmax())
+**/
+void replace_log_softmax(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief replace log_sigmoid() with log(sigmoid())
+**/
+void replace_log_sigmoid(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief chain the inputs and outputs of ops with list reference semantics.
+**/
+void link_mutable_list(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief directly delete nodes that are irrelevant to inference.
+**/
+void eliminate_simple_useless_nodes(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+ * @brief delete useless nodes inside a subgraph or on its inputs (currently
+ * aten::to.device, aten::contiguous, aten::dropout and aten::detach).
+ * note: 1. deleting nodes inside the subgraph (is_input == false) must happen on a
+ *          copy of the subgraph, otherwise fallback breaks.
+ *       2. deleting (replacing) subgraph input nodes (is_input == true) must happen
+ *          after the subgraph was successfully converted to an engine.
+ *
+ * @param [in] subgraph : the subgraph to clean up
+ * @param [in] subgraph_node : the node owning the subgraph, must be of kind prim::CudaFusionGroup
+ * @param [in] is_input : true to delete the subgraph's input nodes, false to delete internal nodes
+ *
+ * @return bool
+ * @retval true => nodes deleted successfully; false => the subgraph would be left
+ *         with zero nodes after cleanup, so unmerge instead
+**/
+bool eliminate_subgraph_useless_nodes(std::shared_ptr<torch::jit::Graph> subgraph,
+                        torch::jit::Node& subgraph_node,
+                        const bool is_input);
+
+/**
+* @brief check for and replace problematic constants.
+**/
+void replace_illegal_constant(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+* @brief replace aten::pad. the op bundles several padding modes, selected via
+* `mode` (constant, reflect, replicate and circular).
+* mode == constant: replaced with aten::constant_pad_nd, implemented.
+* todo:
+* mode == reflect: replace with aten::reflection_pad
+* mode == replicate: replace with aten::replication_pad
+**/
+void replace_pad(std::shared_ptr<torch::jit::Graph> graph);
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/op_fuse_pass.cpp b/poros/src/poros/lowering/op_fuse_pass.cpp
new file mode 100644
index 0000000000..d6eb56ec81
--- /dev/null
+++ b/poros/src/poros/lowering/op_fuse_pass.cpp
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file op_fuse_pass.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:18
+* @brief
+**/
+#include "poros/lowering/op_fuse_pass.h"
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+std::string IFuser::info() {
+    std::string info = "OP Fuser:" + IFuser::name_ + " ";
+
+    for (auto ori : IFuser::fused_ops) {
+        info += "[" + ori->info() + "]";
+    }
+    return info;
+}
+
+void IFuser::setName(const std::string name) {
+    IFuser::name_ = name;
+}
+
+void IFuser::reset() {
+    IFuser::fused_ops.clear();
+}
+
+std::string FusedOpsRecord::info() {
+    std::string info;
+    for (auto it = from_ops_.begin(); it != from_ops_.end(); it++) {
+        info += std::string(it->toUnqualString());
+        if (it != from_ops_.end() - 1) {
+            info += ",";
+        }
+    }
+    info += " => ";
+    for (auto it = to_ops_.begin(); it != to_ops_.end(); it++) {
+        info += std::string(it->toUnqualString());
+        if (it != to_ops_.end() - 1) {
+            info += ",";
+        }
+    }
+    return info;
+}
+
+void FusedOpsRecord::from() {
+
+}
+
+void FusedOpsRecord::to() {
+
+}
+
+void fuse_ops_preprocess(std::shared_ptr<torch::jit::Graph> graph) {
+    IFuserManager *manager = IFuserManager::get_instance();
+    manager->preprocess_fuse(std::move(graph));
+}
+
+void fuse_ops_prewarm(std::shared_ptr<torch::jit::Graph> graph) {
+    IFuserManager *manager = IFuserManager::get_instance();
+    manager->prewarm_fuse(std::move(graph));
+}
+
+IFuserManager *IFuserManager::get_instance() {
+    static IFuserManager manager;
+    return &manager;
+}
+
+std::string IFuserManager::register_fuser(const std::shared_ptr<IFuser>& fuser, const std::string& name) {
+    fuser->setName(name);
+    preprocess_fusers.push_back(fuser);
+    prewarm_fusers.push_back(fuser);
+    return name;
+}
+
+void IFuserManager::preprocess_fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    bool graph_changed = false;
+    for (auto&& fuser : preprocess_fusers) {
+        fuser->reset();
+        if (fuser->fuse(graph)) {
+            LOG(INFO) << fuser->info();
+            graph_changed = true;
+        }
+    }
+    if (graph_changed) {
+        ConstantPropagation(graph);
+        EliminateDeadCode(graph);
+        EliminateCommonSubexpression(graph);
+        ConstantPooling(graph);
+    }
+}
+
+void IFuserManager::prewarm_fuse(std::shared_ptr<torch::jit::Graph> graph) {
+    bool graph_changed = false;
+    for (auto&& fuser : prewarm_fusers) {
+        fuser->reset();
+        if (fuser->fuse(graph)) {
+            LOG(INFO) << fuser->info();
+            graph_changed = true;
+        }
+    }
+    if (graph_changed) {
+        ConstantPropagation(graph);
+        EliminateDeadCode(graph);
+        EliminateCommonSubexpression(graph);
+        ConstantPooling(graph);
+    }
+}
+
+}
+}
+}
\ No newline at end of file
diff --git a/poros/src/poros/lowering/op_fuse_pass.h b/poros/src/poros/lowering/op_fuse_pass.h
new file mode 100644
index 0000000000..4b63cd6f94
--- /dev/null
+++ b/poros/src/poros/lowering/op_fuse_pass.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file op_fuse_pass.h
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:18
+* @brief
+**/
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <torch/csrc/jit/ir/ir.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * FusedOpsRecord
+ * only used for recording fusion information; it does NOT affect the actual fusing logic
+ */
+class FusedOpsRecord {
+public:
+    void from();
+
+    template <typename... Rest>
+    void from(torch::jit::Node *first, Rest... rest);
+
+    template <typename... Rest>
+    void from(torch::jit::NodeKind first, Rest... rest);
+
+    void to();
+
+    template <typename... Rest>
+    void to(torch::jit::Node *first, Rest... rest);
+
+    template <typename... Rest>
+    void to(torch::jit::NodeKind first, Rest... rest);
+
+    std::string info();
+
+private:
+
+    std::vector<torch::jit::NodeKind> from_ops_;
+    std::vector<torch::jit::NodeKind> to_ops_;
+};
+
+template <typename... Rest>
+void FusedOpsRecord::from(torch::jit::Node *first, Rest... rest) {
+    from_ops_.push_back(first->kind());
+    from(rest...);  // recursive call using pack expansion syntax
+}
+
+template <typename... Rest>
+void FusedOpsRecord::from(torch::jit::NodeKind first, Rest... rest) {
+    from_ops_.push_back(first);
+    from(rest...);  // recursive call using pack expansion syntax
+}
+
+template <typename... Rest>
+void FusedOpsRecord::to(torch::jit::Node *first, Rest... rest) {
+    to_ops_.push_back(first->kind());
+    to(rest...);
+}
+
+template <typename... Rest>
+void FusedOpsRecord::to(torch::jit::NodeKind first, Rest... rest) {
+    to_ops_.push_back(first);
+    to(rest...);  // recursive call using pack expansion syntax
+}
+
+/**
+ * IFuser
+ * base class of all fusers
+ */
+class IFuser {
+public:
+    IFuser() = default;
+
+    virtual ~IFuser() = default;
+
+    virtual bool fuse(std::shared_ptr<torch::jit::Graph> graph) = 0;
+
+    std::string info();
+
+    void reset();
+
+    void setName(const std::string name);
+
+    template <typename First, typename... Rest>
+    std::shared_ptr<FusedOpsRecord> record_transform(First first, Rest... rest);
+
+private:
+    std::vector<std::shared_ptr<FusedOpsRecord>> fused_ops;
+
+    std::string name_;
+
+};
+
+template <typename First, typename... Rest>
+std::shared_ptr<FusedOpsRecord> IFuser::record_transform(First first, Rest... rest) {
+    auto f = std::make_shared<FusedOpsRecord>();
+    f->from(first, rest...);  // recursive call using pack expansion syntax
+    fused_ops.push_back(f);
+    return f;
+}
+
+/**
+ * IFuserManager
+ * manages the registration and application of fusers
+ */
+class IFuserManager {
+public:
+
+    static IFuserManager *get_instance();
+
+    std::string register_fuser(const std::shared_ptr<IFuser>& fuser, const std::string& name);
+
+    /**
+     * apply all fusers in preprocess_fusers
+     * @param graph
+     */
+    void preprocess_fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+    /**
+     * apply all fusers in prewarm_fusers
+     * @param graph
+     */
+    void prewarm_fuse(std::shared_ptr<torch::jit::Graph> graph);
+
+private:
+    std::vector<std::shared_ptr<IFuser>> preprocess_fusers;
+    std::vector<std::shared_ptr<IFuser>> prewarm_fusers;
+
+};
+
+/**
+ * try to fuse ops during the preprocessing stage
+ * @param graph
+ */
+void fuse_ops_preprocess(std::shared_ptr<torch::jit::Graph> graph);
+
+/**
+ * try to fuse ops during the pre-warming stage; currently identical to fuse_ops_preprocess.
+ * @param graph
+ */
+void fuse_ops_prewarm(std::shared_ptr<torch::jit::Graph> graph);
+
+#define REGISTER_OP_FUSER(T)                                      \
+    const std::string _G_NAME = []() -> std::string {             \
+        return IFuserManager::get_instance()->register_fuser(     \
+                std::make_shared<T>(), #T);                       \
+    }();
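+// Usage sketch: writing REGISTER_OP_FUSER(FuseGelu) at namespace scope in a
+// .cpp file expands to a global std::string whose initializer registers a
+// FuseGelu instance with the IFuserManager singleton, so linking the file is
+// enough to enable the fuser.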
+
+}
+}
+}
diff --git a/poros/src/poros/lowering/remove_simple_type_profile_nodes.cpp b/poros/src/poros/lowering/remove_simple_type_profile_nodes.cpp
new file mode 100644
index 0000000000..78d08f8705
--- /dev/null
+++ b/poros/src/poros/lowering/remove_simple_type_profile_nodes.cpp
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file remove_simple_type_profile_nodes.cpp
+* @author tianjinjin@baidu.com
+* @date Mon May 10 11:06:53 CST 2021
+* @brief
+**/
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/common_subexpression_elimination.h>
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+struct RemoveSimpleTypeProfileNodes {
+    RemoveSimpleTypeProfileNodes(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+    void run() {
+        remove_profile_nodes(graph_->block());
+    }
+
+private:
+    bool profiled_with_different_types(Value* v) {
+        std::vector<TypePtr> types;
+        for (const auto& use : v->uses()) {
+            if (use.user->kind() == prim::profile) {
+                types.push_back(use.user->ty(attr::profiled_type));
+            }
+        }
+        for (size_t i = 1; i < types.size(); ++i) {
+            if (types.at(i - 1) != types.at(i)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    bool is_simple_type_profile_node(Node* node) {
+        return node->ty(attr::profiled_type) != TensorType::get();
+    }
+
+    void remove_profile_nodes(Block* block) {
+        for (auto itr = block->nodes().begin(); itr != block->nodes().end(); itr++) {
+            if (itr->kind() == prim::profile && is_simple_type_profile_node(*itr)) { //todo
+                itr->output()->replaceAllUsesWith(itr->input());
+                if (!profiled_with_different_types(itr->input())) {
+                    itr->input()->setType(itr->ty(attr::profiled_type));
+                }
+                itr.destroyCurrent();
+            } else {
+                for (Block* ib : itr->blocks()) {
+                    remove_profile_nodes(ib);
+                }
+            }
+        }
+    }
+
+    std::shared_ptr<Graph> graph_;
+};
+
+} // namespace
+
+void remove_simple_type_profile_nodes(std::shared_ptr<torch::jit::Graph> graph) {
+    RemoveSimpleTypeProfileNodes(graph).run();
+}
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/lowering/replace_illegal_constant.cpp b/poros/src/poros/lowering/replace_illegal_constant.cpp
new file mode 100644
index 0000000000..fe3eeecf52
--- /dev/null
+++ b/poros/src/poros/lowering/replace_illegal_constant.cpp
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +/** +* @file replace_illegal_constant.cpp +* @author tianshaoqing@baidu.com +* @date 2022-06-01 19:34:40 +* @brief +**/ +#include "poros/lowering/lowering_pass.h" + +#include +#include +#include +#include +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +struct ReplaceIllegalConstant { + ReplaceIllegalConstant(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + GRAPH_DUMP("before replace_illegal_constant Graph: ", graph_); + bool changed = find_and_replace_illegal_constant(graph_->block()); + if (changed) { + ConstantPropagation(graph_); + EliminateDeadCode(graph_); + EliminateCommonSubexpression(graph_); + ConstantPooling(graph_); + } + GRAPH_DUMP("after replace_illegal_constant Graph: ", graph_); + return; + } + +private: + bool check_constant_is_illegal(Node* node) { + bool is_illegal = false; + // 检查constant输出是否是int + if (node->kind() == torch::jit::prim::Constant && node->outputs().size() > 0 && + node->output(0)->type()->kind() == c10::TypeKind::IntType) { + torch::jit::IValue const_value = toIValue(node->output(0)); + // 这里的toInt返回的是int64_t + long const_double = const_value.toInt(); + // 判断int是否等于INT64_MAX,且有users + if ((const_double == INT64_MAX) && node->output(0)->hasUses()) { + is_illegal = true; + auto const_node_users = node->output(0)->uses(); + // 判断逻辑,目前只遇到了slice end输入为非法constant的情况,其他情况遇到再加 + for (size_t u = 0; u < const_node_users.size(); u++) { + if (const_node_users[u].user->kind() != torch::jit::aten::slice) { + is_illegal = false; + break; + } + } + } + } + return is_illegal; + } + + bool find_and_replace_illegal_constant(Block* block) { + bool graph_changed = false; + auto it = block->nodes().begin(); + while (it != block->nodes().end()) { + auto node = *it; + ++it; //++it first, node may be destroyed later。 + for (auto sub_block: node->blocks()) { + if (find_and_replace_illegal_constant(sub_block)) { + graph_changed = true; + } + } + + if (node->kind() == torch::jit::prim::Constant && + check_constant_is_illegal(node)) { + // 将slice end输入替换为none + torch::jit::Node* none_node = graph_->createNone(); + none_node->insertBefore(node); + node->output(0)->replaceAllUsesAfterNodeWith(node, none_node->output(0)); + LOG(INFO) << "Found illegal constant INT64_MAX used as index by aten::slice. Replace it with Constant None."; + node->destroy(); + graph_changed = true; + } + } + return graph_changed; + } + + std::shared_ptr graph_; + std::unordered_set useless_schema_set_; +}; + +} // namespace + +void replace_illegal_constant(std::shared_ptr graph) { + ReplaceIllegalConstant ric(std::move(graph)); + ric.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/replace_pad.cpp b/poros/src/poros/lowering/replace_pad.cpp new file mode 100644 index 0000000000..4245dd017c --- /dev/null +++ b/poros/src/poros/lowering/replace_pad.cpp @@ -0,0 +1,112 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
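A self-contained sketch of the pattern this pass targets, assuming the `replace_illegal_constant` entry point declared in `lowering_pass.h`; the IR text is illustrative:

```cpp
#include <torch/csrc/jit/ir/irparser.h>

#include "poros/lowering/lowering_pass.h"

void demo_replace_illegal_constant() {
    // aten::slice with the INT64_MAX sentinel as its `end` argument.
    const std::string graph_ir = R"IR(
        graph(%x : Tensor):
          %dim : int = prim::Constant[value=0]()
          %start : int = prim::Constant[value=0]()
          %end : int = prim::Constant[value=9223372036854775807]()
          %step : int = prim::Constant[value=1]()
          %out : Tensor = aten::slice(%x, %dim, %start, %end, %step)
          return (%out))IR";
    auto graph = std::make_shared<torch::jit::Graph>();
    torch::jit::parseIR(graph_ir, graph.get());

    // %end's use in aten::slice is replaced by a prim::Constant None, which is
    // legal because the schema types `end` as int?.
    baidu::mirana::poros::replace_illegal_constant(graph);
    graph->dump();
}
```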
+// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file replace_pad.cpp +* @author tianshaoqing@baidu.com +* @date 2022-11-09 19:34:40 +* @brief +**/ +#include "poros/lowering/lowering_pass.h" + +#include +#include +#include +#include +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +struct ReplacePad { + ReplacePad(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + GRAPH_DUMP("before replace pad Graph: ", graph_); + bool changed = find_and_replace_pad(graph_->block()); + if (changed) { + ConstantPropagation(graph_); + EliminateDeadCode(graph_); + EliminateCommonSubexpression(graph_); + ConstantPooling(graph_); + } + GRAPH_DUMP("after replace pad Graph: ", graph_); + return; + } + +private: + bool check_pad_constant_mode(Node* node) { + bool is_constant_mode = false; + // 检查schema是否为 + // aten::pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> (Tensor) + if (node->kind() == c10::Symbol::fromQualString("aten::pad") && + node->inputs().size() == 4 && + node->input(1)->type()->isSubtypeOf(c10::ListType::ofInts()) && + node->input(2)->type()->isSubtypeOf(c10::StringType::get())) { + std::string pad_mode = toIValue(node->input(2)).value().toStringRef(); + if (pad_mode == "constant") { + is_constant_mode = true; + } + } + return is_constant_mode; + } + + bool find_and_replace_pad(Block* block) { + bool graph_changed = false; + auto it = block->nodes().begin(); + while (it != block->nodes().end()) { + auto node = *it; + ++it; //++it first, node may be destroyed later。 + for (auto sub_block: node->blocks()) { + if (find_and_replace_pad(sub_block)) { + graph_changed = true; + } + } + // replace aten::pad with aten::constant_pad_nd when its padding mode is "constant". + if (node->kind() == c10::Symbol::fromQualString("aten::pad") && + check_pad_constant_mode(node)) { + torch::jit::Node* constant_pad_nd_node = graph_->create(torch::jit::aten::constant_pad_nd); + constant_pad_nd_node->addInput(node->input(0)); + constant_pad_nd_node->addInput(node->input(1)); + constant_pad_nd_node->addInput(node->input(3)); + constant_pad_nd_node->insertBefore(node); + node->output(0)->replaceAllUsesAfterNodeWith(node, constant_pad_nd_node->output(0)); + LOG(INFO) << "Replace aten::pad which padding mode is \"constant\" with aten::constant_pad_nd."; + node->destroy(); + graph_changed = true; + } + } + return graph_changed; + } + + std::shared_ptr graph_; + std::unordered_set useless_schema_set_; +}; + +} // namespace + +void replace_pad(std::shared_ptr graph) { + ReplacePad rp(std::move(graph)); + rp.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/segment_post_processing.cpp b/poros/src/poros/lowering/segment_post_processing.cpp new file mode 100644 index 0000000000..88beb3efd9 --- /dev/null +++ b/poros/src/poros/lowering/segment_post_processing.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
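For illustration, a sketch of the rewrite this pass performs, assuming the `replace_pad` entry point declared in `lowering_pass.h`. `aten::pad` with `mode="constant"` is behaviorally `aten::constant_pad_nd(self, pad, value)`, so the mode argument can simply be dropped:

```cpp
#include <torch/csrc/jit/ir/irparser.h>

#include "poros/lowering/lowering_pass.h"

void demo_replace_pad() {
    const std::string graph_ir = R"IR(
        graph(%x : Tensor):
          %pad : int[] = prim::Constant[value=[1, 1]]()
          %mode : str = prim::Constant[value="constant"]()
          %value : float = prim::Constant[value=0.]()
          %out : Tensor = aten::pad(%x, %pad, %mode, %value)
          return (%out))IR";
    auto graph = std::make_shared<torch::jit::Graph>();
    torch::jit::parseIR(graph_ir, graph.get());

    // After the pass: %out : Tensor = aten::constant_pad_nd(%x, %pad, %value)
    baidu::mirana::poros::replace_pad(graph);
    graph->dump();
}
```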
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file segment_post_processing.cpp +* @author tianshaoqing@baidu.com +* @date Thu May 27 11:13:02 CST 2022 +* @brief +**/ + +#include "poros/lowering/segment_post_processing.h" + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +using namespace torch::jit; + +void subgraph_outputs_int2long(torch::jit::Graph* parent_graph, + torch::jit::Node& subgraph_node, + std::shared_ptr subgraph) { + AT_ASSERT(subgraph_node.kind() == torch::jit::prim::CudaFusionGroup); + // 检查子图的每个Tensor的输出类型 + for (size_t i = 0; i < subgraph->outputs().size(); i++) { + torch::jit::Value* output_value = subgraph->outputs()[i]; + if (output_value->type()->isSubtypeOf(c10::TensorType::get())) { + auto subgraph_output_type = output_value->type()->cast(); + if (subgraph_output_type->scalarType() == at::ScalarType::Long) { + // 如果子图Tensor输出是Long,则添加aten::to.dtype,schema如下: + // aten::to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor + LOG(INFO) << "Find output type is Long, which is " << node_info(&subgraph_node) << " output[" << i << + "] %" << output_value->debugName() << ". Add aten::to(Long) node."; + torch::jit::Node* to_long_node = parent_graph->create(torch::jit::aten::to, 1); + to_long_node->insertAfter(&subgraph_node); + to_long_node->addInput(subgraph_node.output(i)); + // 不用setInsertPoint的话默认将constant插入到图的末尾,movebefore将constant移到to_long_node之前 + torch::jit::Value* false_value = parent_graph->insertConstant(false); + false_value->node()->moveBefore(to_long_node); + torch::jit::Value* type_value = parent_graph->insertConstant(c10::ScalarType::Long); + type_value->node()->moveBefore(to_long_node); + torch::jit::Node* none_node = parent_graph->createNone(); + none_node->insertBefore(to_long_node); + + to_long_node->addInput(type_value); + to_long_node->addInput(false_value); + to_long_node->addInput(false_value); + to_long_node->addInput(none_node->output(0)); + + // must set output type + to_long_node->output(0)->setType(subgraph_output_type); + subgraph_node.output(i)->replaceAllUsesAfterNodeWith(to_long_node, to_long_node->output(0)); + } + } + } +}; + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/segment_post_processing.h b/poros/src/poros/lowering/segment_post_processing.h new file mode 100644 index 0000000000..d1b7133210 --- /dev/null +++ b/poros/src/poros/lowering/segment_post_processing.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
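A hedged sketch of where `subgraph_outputs_int2long` fits in the pipeline: it is called once a fused segment has been swapped into its parent graph as a `prim::CudaFusionGroup` node (the wrapper function here is illustrative):

```cpp
#include "poros/lowering/segment_post_processing.h"

void fixup_long_outputs(torch::jit::Graph* parent_graph,
                        torch::jit::Node* fusion_node,
                        std::shared_ptr<torch::jit::Graph> subgraph) {
    // For each subgraph output statically typed Long, an
    // aten::to(self, ScalarType::Long, non_blocking=False, copy=False,
    // memory_format=None) node is inserted after the fusion node, because the
    // TensorRT engine wrapped by the node can only produce Int tensors.
    baidu::mirana::poros::subgraph_outputs_int2long(parent_graph, *fusion_node, subgraph);
}
```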
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file segment_post_processing.h
+* @author tianshaoqing@baidu.com
+* @date 2022-05-27 11:11:18
+* @brief
+**/
+
+#pragma once
+
+#include <torch/script.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+/**
+ * @brief Some subgraphs originally output Long tensors, but the TensorRT engine
+ * only supports Int. We therefore append an aten::to(Long) op after the engine
+ * output to restore the original dtype, so that ops which strictly check for
+ * Long inputs (e.g. aten::index) keep working.
+ *
+ * @param [in] parent_graph : the owning graph of subgraph_node
+ * @param [in] subgraph_node : the subgraph node; its kind must be prim::CudaFusionGroup
+ * @param [in] subgraph : the subgraph held by subgraph_node
+ *
+ * @return
+ * @retval
+**/
+void subgraph_outputs_int2long(torch::jit::Graph* parent_graph,
+                            torch::jit::Node& subgraph_node,
+                            std::shared_ptr<torch::jit::Graph> subgraph);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/lowering/try_to_freeze_aten_dim.cpp b/poros/src/poros/lowering/try_to_freeze_aten_dim.cpp
new file mode 100644
index 0000000000..a77592095d
--- /dev/null
+++ b/poros/src/poros/lowering/try_to_freeze_aten_dim.cpp
@@ -0,0 +1,103 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file: /poros/baidu/mirana/poros/src/poros/lowering/try_to_freeze_aten_dim.cpp
+* @author: zhangfan51@baidu.com
+* @date: 2022-03-24 16:02:50
+* @brief:
+**/
+
+#include "poros/lowering/lowering_pass.h"
+
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/constant_pooling.h>
+
+#include "poros/util/poros_util.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+namespace {
+using namespace torch::jit;
+
+bool has_type_and_dim(const Value* value) {
+    auto op = value->type()->cast<c10::TensorType>();
+    return op->sizes().size().has_value() && op->scalarType().has_value();
+}
+
+struct FreezeAtenDim {
+    FreezeAtenDim(std::shared_ptr<Graph> graph) : graph_(std::move(graph)) {}
+
+    void run() {
+        try_to_freeze_aten_dim(graph_->block());
+        // run one round of constant folding afterwards
+        torch::jit::ConstantPropagation(graph_);
+        // torch::jit::ConstantPooling(graph_);
+    }
+
+private:
+    void replace_aten_dim(Node* node, int inplace_number) {
+        LOG(INFO) << "try to replace the output of node :" << node_info(node)
+                << " with constant value " << inplace_number;
+        torch::jit::WithInsertPoint guard(graph_->block()->nodes().front());
+        auto len_const = graph_->insertConstant(inplace_number);
+        node->outputs().at(0)->replaceAllUsesWith(len_const);
+    }
+
+    /*
+    * @brief Try to turn the return value of aten::dim into a constant: if the
+    * rank of the input tensor is fixed after warm-up, the output can be
+    * replaced with that constant.
+    **/
+    void try_to_freeze_aten_dim(Block* block) {
+        auto it = block->nodes().begin();
+        while (it != block->nodes().end()) {
+            auto node = *it;
+            ++it; // increment the iterator first; the node may be destroyed below
+            for (auto block : node->blocks()) {
+                try_to_freeze_aten_dim(block);
+            }
+
+            // only handle the aten::dim case
+            if (node->kind() != aten::dim) {
+                continue;
+            }
+
+            // the input is a tensor and it carries shape information
+            if ((node->inputs()[0])->type()->isSubtypeOf(c10::TensorType::get()) &&
+                has_type_and_dim(node->inputs()[0])) {
+                auto sizes = 
(node->inputs()[0])->type()->cast()->sizes(); + // if (sizes[0].has_value()) { + auto ndim = sizes.size(); + replace_aten_dim(node, *ndim); + // } + continue; + } + } + } + + std::shared_ptr graph_; +}; + +} // namespace + +void freeze_aten_dim(std::shared_ptr graph) { + LOG(INFO) << "Running poros freeze_aten_len passes"; + FreezeAtenDim pss(std::move(graph)); + pss.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/try_to_freeze_aten_len.cpp b/poros/src/poros/lowering/try_to_freeze_aten_len.cpp new file mode 100644 index 0000000000..d63fea3a76 --- /dev/null +++ b/poros/src/poros/lowering/try_to_freeze_aten_len.cpp @@ -0,0 +1,167 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file try_to_freeze_aten_len.cpp +* @author tianjinjin@baidu.com +* @date Sun Sep 26 20:00:01 CST 2021 +* @brief +**/ + +#include "poros/lowering/lowering_pass.h" + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +bool has_type_and_dim(const Value* value) { + auto op = value->type()->cast(); + return op->sizes().size().has_value() && op->scalarType().has_value(); +} + +struct FreezeAtenLen { + FreezeAtenLen(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + try_to_freeze_aten_len(graph_->block()); + } + +private: + void replace_aten_len(Node* node, int inplace_number) { + LOG(INFO) << "try to replace the output of node :" << node_info(node) + << " with constant value " << inplace_number; + torch::jit::WithInsertPoint guard(graph_->block()->nodes().front()); + auto len_const = graph_->insertConstant(inplace_number); + node->outputs().at(0)->replaceAllUsesWith(len_const); + } + + bool try_to_replace_listconstruct_len(Node* node, Node* len_node) { + if (node->kind() != prim::ListConstruct) { + return false; + } + for (auto &use : node->outputs()[0]->uses()) { + if (use.user->owningBlock() != node->owningBlock() || + use.user->kind() == aten::append) { + return false; + } + } + replace_aten_len(len_node, node->inputs().size()); + return true; + } + + /** + * @brief 尝试将aten::len的返回值变成常量,当前支持的场景: + * 1. aten::len 的输入是一个tensor(前提是我们认为tensor的size可能是dynamic的,但是len是确定的) + * 2. aten::len 的输入是prim::ListConstruct构建的list,当list的长度可以明确的时候,进行常量替换。 + * 3. aten::len 的输入是aten::unbind,根据该算子的语义,获取其len并替换。 + * 4. 
aten::len 的输入是aten::meshgrid,由于这类算子不改变输入的len信息,进一步获取算子的输入,尝试进行常量替换。 + **/ + void try_to_freeze_aten_len(Block* block) { + auto it = block->nodes().begin(); + while (it != block->nodes().end()) { + auto node = *it; + ++it; //先++it, node 可能destroy掉。 + for (auto block : node->blocks()) { + try_to_freeze_aten_len(block); + } + + //只handle aten::len 的场景 + if (node->kind() != aten::len) { + continue; + } + + //输入是一个tensor的场景, aten::len的结果 + if ((node->inputs()[0])->type()->isSubtypeOf(c10::TensorType::get()) && + has_type_and_dim(node->inputs()[0])) { + LOG(INFO) << "input is tensor situation."; + auto sizes = (node->inputs()[0])->type()->cast()->sizes(); + if (sizes[0].has_value()) { + int len = sizes[0].value(); + replace_aten_len(node, len); + } + continue; + // std::vector dims; + // if (gen_dims_for_tensor(node->inputs()[0], dims)) { + // int len = (dims.size()) & INT_MAX; + // replace_aten_len(node, len); + // } + // continue; + } + + //输入非tensor的场景,根据输入类型节点的类型简单判断。 + auto input_node = (node->inputs()[0])->node(); + switch (input_node->kind()) { + // unbind: 等于第一个输入的 + case aten::unbind: { + LOG(INFO) << "input is produced by aten::unbind situation."; + if (has_type_and_dim(input_node->inputs()[0])) { + std::vector dims; + if (gen_dims_for_tensor(input_node->inputs()[0], dims) && + input_node->inputs()[1]->node()->kind() == prim::Constant) { + int dim = toIValue(input_node->inputs()[1]->node()->output()).value().toInt(); + dim = dim < 0 ? dims.size() + dim : dim; + //非dynamic的维度 + if (dims[dim] != -1) { + auto len = dims[dim]; + replace_aten_len(node, len); + torch::jit::WithInsertPoint guard(graph_->block()->nodes().front()); + auto len_const = graph_->insertConstant(len); + node->outputs().at(0)->replaceAllUsesWith(len_const); + } + } + } + break; + } + //这些op的输入输出的len不会发生变化。再找一下这类op的输入。 + case aten::meshgrid: { + LOG(INFO) << "input is produced by aten:meshgrid situation."; + if ((input_node->inputs()[0])->node()->kind() == prim::ListConstruct) { + try_to_replace_listconstruct_len((input_node->inputs()[0])->node(), node); + } + break; + } + //prim::ListConstruct 的情况 + case prim::ListConstruct: { + LOG(INFO) << "input is produced by prim::ListConstruct situation."; + try_to_replace_listconstruct_len(input_node, node); + break; + } + default: { + //遇到目前不支持的类型,直接返回,不做处理。 + LOG(INFO) << "unsupported situation. input_node is: " << node_info(input_node); + break; + } + } + } + } + + std::shared_ptr graph_; +}; + +} // namespace + +void freeze_aten_len(std::shared_ptr graph) { + LOG(INFO) << "Running poros freeze_aten_len passes"; + FreezeAtenLen fal(std::move(graph)); + fal.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/try_to_freeze_aten_size.cpp b/poros/src/poros/lowering/try_to_freeze_aten_size.cpp new file mode 100644 index 0000000000..344f609f20 --- /dev/null +++ b/poros/src/poros/lowering/try_to_freeze_aten_size.cpp @@ -0,0 +1,226 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
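A small sketch of case 2 from the list above: `aten::len` over a `prim::ListConstruct` whose element count is structurally known (entry point name as declared in `lowering_pass.h`; IR text illustrative):

```cpp
#include <torch/csrc/jit/ir/irparser.h>

#include "poros/lowering/lowering_pass.h"

void demo_freeze_aten_len() {
    const std::string graph_ir = R"IR(
        graph(%a : Tensor, %b : Tensor):
          %list : Tensor[] = prim::ListConstruct(%a, %b)
          %len : int = aten::len(%list)
          return (%len))IR";
    auto graph = std::make_shared<torch::jit::Graph>();
    torch::jit::parseIR(graph_ir, graph.get());

    // The list is never aten::append-ed to, so its length is fixed at 2 and
    // %len's uses are redirected to prim::Constant[value=2]().
    baidu::mirana::poros::freeze_aten_len(graph);
    graph->dump();
}
```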
+// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file try_to_freeze_aten_size.cpp +* @author tianjinjin@baidu.com +* @date Fri Nov 26 11:35:16 CST 2021 +* @brief +**/ + +#include "poros/lowering/lowering_pass.h" + +#include +#include +#include +#include +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +bool has_type_and_dim(const Value* value) { + auto op = value->type()->cast(); + return op->sizes().size().has_value() && op->scalarType().has_value(); +} + +static std::string output_vec_size(const std::vector& vec_size) { + if (vec_size.empty()) { + return std::string(""); + } else { + std::string output_str = "["; + for (int64_t i : vec_size) { + output_str += (std::to_string(i) + std::string(", ")); + } + output_str.pop_back(); + output_str.pop_back(); + output_str.push_back(']'); + return output_str; + } +} + +struct FreezeAtenSize { + FreezeAtenSize(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + GRAPH_DUMP("before freeze_aten_sizes Graph: ", graph_); + bool changed = freeze_aten_sizes(graph_->block()); + if (changed) { + ConstantPropagation(graph_); + EliminateDeadCode(graph_); + EliminateCommonSubexpression(graph_); + ConstantPooling(graph_); + } + GRAPH_DUMP("after freeze_aten_sizes Graph: ", graph_); + } + +private: + + bool is_aten_size_node(Node* node) { + if (node->kind() != aten::size) { + return false; + } + //TODO: may be add more check situation + return true; + } + + void replace_int_list(Node* node, const std::vector& inplace_number) { + LOG(INFO) << "try to replace the output of node :" << node_info(node) + << " with constant value " << output_vec_size(inplace_number); + torch::jit::WithInsertPoint guard(graph_->block()->nodes().front()); + auto int_list_const = graph_->insertConstant(inplace_number); + node->outputs().at(0)->replaceAllUsesWith(int_list_const); + } + + /** + * try to calculate the result of aten::slice. + * the schema of aten::slice is: + * "aten::slice(t[] l, int? start=None, int? end=None, int step=1) -> t[]" + * **/ + bool calculate_aten_slice(const torch::jit::Node* slice_node, + const std::vector& input, + std::vector& output) { + + if (slice_node->inputs().at(1)->node()->kind() != prim::Constant || + slice_node->inputs().at(2)->node()->kind() != prim::Constant || + slice_node->inputs().at(3)->node()->kind() != prim::Constant) { + return false; + } + + const int64_t input_len = input.size(); + auto maybe_start = toIValue(slice_node->inputs().at(1)); + auto start_index = maybe_start->isNone() ? 0 : maybe_start.value().toInt(); + const int64_t normalized_start = (start_index < 0) ? (input_len + start_index) : start_index; + + auto maybe_end = toIValue(slice_node->inputs().at(2)); + auto temp_end_index = maybe_end->isNone() ? INT64_MAX : maybe_end.value().toInt(); + auto end_idx = std::min(temp_end_index, input_len); + const int64_t normalized_end = (end_idx < 0) ? 
(input_len + end_idx) : end_idx; + + if (normalized_end <= normalized_start) { + return false; + } + int64_t step = toIValue(slice_node->inputs().at(3)).value().toInt(); + + output.reserve(normalized_end - normalized_start); + for (auto i = normalized_start; i < normalized_end;) { + output.push_back(input[i]); + i += step; + } + + LOG(INFO) << "calculate_aten_slice done, input size: " << output_vec_size(input) + << ", start_index: " << normalized_start + << ", end_index: " << normalized_end + << ", step: " << step + << ", ouput size: " << output_vec_size(output); + + auto it = std::find_if(output.begin(), output.end(), [&](const int64_t& v) {return v == -1;}); + //不满足条件,output中有-1的值,说明存在dynamic的dim,不能替换成常量。 + if (it != output.end()) { + return false; + } + + return true; + } + + /** + * @brief 尝试解析aten::size的数据 + * 如果aten::size返回的list的后续使用,可以解除与动态变化的维度的关系,则相应值进行常量替换。 + **/ + bool try_to_freeze_aten_size(Node* node) { + + std::vector dims; + if ((node->inputs()[0])->type()->isSubtypeOf(c10::TensorType::get()) && + has_type_and_dim(node->inputs()[0])) { + gen_dims_for_tensor(node->inputs()[0], dims); + } else { + return false; + } + if (node->inputs().size() == 2) { + return false; + } + + //输入非tensor的场景,根据输入类型节点的类型简单判断。 + auto output_value = node->outputs()[0]; // should be a int[] + auto users_count = (node->outputs()[0])->uses().size(); + + //situation one: 如果aten::size算子本身的计算结果里面没有-1,则表示该tensor非dynamic, + //则可以不管后面跟的是什么op,直接替换size的输出。 + auto it = std::find_if(dims.begin(), dims.end(), [&](const int64_t& v) {return v == -1;}); + if (it == dims.end()) { + LOG(INFO) << "aten size output memebers are all constant situation, dim info: " << output_vec_size(dims); + replace_int_list(node, dims); + node->destroy(); + return true; + } + + //situation two: aten::size is dynamic but the user is aten::slice + if (users_count == 1 && (output_value->uses()[0]).user->kind() == aten::slice) { + LOG(INFO) << "aten size user is aten::slice situation."; + auto slice_node = (output_value->uses()[0]).user; + std::vector sliced_list; + if (calculate_aten_slice(slice_node, dims, sliced_list)) { + //满足条件,替换节点 + replace_int_list(slice_node, sliced_list); + //slice node 可以析构掉了 + slice_node->destroy(); + //当前的aten::size节点也可以析构掉了 + node->destroy(); + return true; + } + } else { + LOG(INFO) << "not supported situation now."; + } + return false; + } + + bool freeze_aten_sizes(Block* block) { + bool changed = false; + //fix bug: 可能连续后面几个节点被删除(比如aten::slice + aten::size), iterator改成从后往前迭代。 + for (auto it = block->nodes().rbegin(); it != block->nodes().rend();) { + // we might destroy the current node, so we need to pre-increment + // the iterator + Node* node = *it; + ++it; + for (Block* subblock : node->blocks()) { + changed |= freeze_aten_sizes(subblock); + } + if (is_aten_size_node(node)) { + LOG(INFO) << "find aten::size node: " << node_info(node); + changed |= try_to_freeze_aten_size(node); + } + } + return changed; + } + + std::shared_ptr graph_; +}; + +} // namespace + +void freeze_aten_size(std::shared_ptr graph) { + LOG(INFO) << "Running poros freeze_aten_size passes"; + FreezeAtenSize fas(std::move(graph)); + fas.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/try_to_freeze_list_construct.cpp b/poros/src/poros/lowering/try_to_freeze_list_construct.cpp new file mode 100644 index 0000000000..bd6d3a2ecf --- /dev/null +++ b/poros/src/poros/lowering/try_to_freeze_list_construct.cpp @@ -0,0 +1,174 @@ +// Copyright (c) 
2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file: try_to_freeze_array.cpp +* @author: zhangfan51@baidu.com +* @data: 2022-03-23 15:53:29 +* @brief: +**/ +#include "poros/lowering/lowering_pass.h" + +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; +/** + * @brief 尝试展开for循环内list.append()的情况 + * try to expand + * graph(%feat.29 : Tensor): + * %256 : int = prim::Constant[value=2]() + * %257 : float = prim::Constant[value=2.]() + * %scale_factors.88 : float[] = prim::ListConstruct() + * = prim::Loop(%256, %2146) + * block0(%746 : int): + * %747 : float[] = aten::append(%scale_factors.88, %257) + * -> (%2146) + * as + * graph(%feat.29 : Tensor): + * %256 : int = prim::Constant[value=2]() + * %257 : float = prim::Constant[value=2.]() + * %scale_factors.88 : float[] = prim::ListConstruct() + * %747 : float[] = aten::append(%scale_factors.88, %257) + * %748 : float[] = aten::append(%scale_factors.88, %257) + **/ + + +struct FreeezeListConstruct { + FreeezeListConstruct(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + try_to_freeze_list_construct(graph_->block()); + // 运行一遍常量折叠 + torch::jit::ConstantPropagation(graph_); + } + +private: + template + void replace_contant_list_construct(Node* node, std::vector &data_array) { + LOG(INFO) << "try to replace the output of node :" << node_info(node) + << " with constant value " << data_array; + torch::jit::WithInsertPoint guard(graph_->block()->nodes().front()); + auto list_const = graph_->insertConstant(data_array); + node->outputs().at(0)->replaceAllUsesWith(list_const); + } + + void try_to_freeze_list_construct(Block* block) { + auto it = block->nodes().begin(); + while (it != block->nodes().end()) { + auto node = *it; + ++it; //先++it, node 可能destroy掉。 + for (auto block : node->blocks()) { + try_to_freeze_list_construct(block); + } + + // 找到 prim::ListConstruct + if (node->kind() != prim::ListConstruct) { + continue; + } + // 判断 ListConstruct的output为float[] or int[] + if (!(node->outputs()[0])->type()->isSubtypeOf(c10::ListType::ofFloats()) && + !(node->outputs()[0])->type()->isSubtypeOf(c10::ListType::ofInts())) { + continue; + } + // 判断 ListConstruct的inputs应为空,否则会有值不相同 + if (node->inputs().size() != 0) { + continue; + } + //only do for float[] and int[] + // 判断该ListConstruct的所有使用者,是否仅做过一次aten::append修改 + int use_flag = 0; + Node* app_node = nullptr; + for (auto &use : node->outputs()[0]->uses()) { + if (use.user->owningBlock() != node->owningBlock() && + use.user->kind() == aten::append) { + use_flag++; + app_node = use.user; + } + } + if (use_flag != 1) { + continue; + } + // 判断append的block 是放在prim::Loop中的, 且在该Loop里只有1个block + Block* app_block = app_node->owningBlock(); + Node* loop_node = app_block->owningNode(); + // 目前先仅考虑owingNode为prim::Loop的情况,如后面遇到其他类似pattern再做相应判断调整 + if (loop_node->kind() != prim::Loop || loop_node->blocks().size() > 1) { + continue; + } + + 
auto app_it = app_block->nodes().begin(); + std::vector app_block_nodes; + while (app_it != app_block->nodes().end()) { + app_block_nodes.push_back(*app_it); + ++app_it; + } + // 仅处理形如这种的情况: + // block0(%746 : int): + // %747 : float[] = aten::append(%scale_factors.88, %257) + // -> (%2146) + // block 中仅包括1个append + if (app_block_nodes.size() != 1) { + LOG(INFO) << "freeze_list_construct: append block nodes size is more than 1. "; + continue; + } + // prim::Loop的 循环次数必须为prim::Constant, append的value也必须为prim::Constant. + if ((loop_node->inputs()[0])->node()->kind() != prim::Constant || + (app_node->inputs()[1])->node()->kind() != prim::Constant ) { + LOG(INFO) << "freeze_list_construct: append's input or loop's input type is not prim::Constant."; + continue; + } + auto loop_max = toIValue(loop_node->inputs()[0]->node()->output()).value().toInt(); + auto loop_cond = toIValue(loop_node->inputs()[1]->node()->output()).value().toBool(); + // loop_cond must be true here, check again. + if (!loop_cond) { + continue; + } + auto value = toIValue((app_node->inputs()[1])->node()->output()).value(); + if (value.isInt()) { + std::vector array_value(loop_max, value.toInt()); + replace_contant_list_construct(node, array_value); + } else if (value.isDouble()) { + std::vector array_value(loop_max, value.toDouble()); + replace_contant_list_construct(node, array_value); + } else { + continue; + } + //destroy app_block下所有node + for (size_t i = 0; i < app_block_nodes.size(); ++i) { + app_block_nodes[i]->destroy(); + } + // loop_node->destroy(); + } + } + std::shared_ptr graph_; +}; + +} // namespace + +void freeze_list_construct(std::shared_ptr graph) { + LOG(INFO) << "Running poros freeze_list_construct passes"; + FreeezeListConstruct flc(std::move(graph)); + flc.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/try_to_freeze_percentformat.cpp b/poros/src/poros/lowering/try_to_freeze_percentformat.cpp new file mode 100644 index 0000000000..91985ded02 --- /dev/null +++ b/poros/src/poros/lowering/try_to_freeze_percentformat.cpp @@ -0,0 +1,249 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
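As a usage sketch for the pass above (names assumed from `lowering_pass.h`); the before/after shapes match the IR example in the pass's own comment:

```cpp
#include "poros/lowering/lowering_pass.h"

// before:  %scales : float[] = prim::ListConstruct()
//           = prim::Loop(%trip, %cond)   // %trip = 2, %cond = True, both constant
//            block0(%i : int):
//              %_ : float[] = aten::append(%scales, %val)  // %val = 2., constant
//              -> (%cond)
// after:   uses of %scales read prim::Constant[value=[2., 2.]](); the append
//          inside the loop body is destroyed and constant folding cleans up.
void demo_freeze_list_construct(std::shared_ptr<torch::jit::Graph> graph) {
    baidu::mirana::poros::freeze_list_construct(graph);
}
```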
+ +/** +* @file try_to_freeze_percentformat.cpp +* @author tianjinjin@baidu.com +* @date Wed Nov 24 15:13:00 CST 2021 +* @brief +**/ + +#include "poros/lowering/lowering_pass.h" + +#include +#include +#include +#include +#include +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { + +using namespace torch::jit; + +struct FreezePercentFormat { + FreezePercentFormat(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + bool changed = freeze_percentformats(graph_->block()); + if (changed) { + ConstantPropagation(graph_); + EliminateDeadCode(graph_); + EliminateCommonSubexpression(graph_); + ConstantPooling(graph_); + //PeepholeOptimize(graph_, /*addmm_fusion_enabled*/false); + //CheckInplace(graph_); + //runRequiredPasses(graph_); + } + return; + } + +private: + + bool is_percent_format_node(Node* node) { + if (node->kind() != aten::percentFormat) { + return false; + } + //maybe to add more + return true; + } + + // IValue tags are intentionally private, so we need additional logic to cast + // the IValue type to the specified format. + void add_formatted_arg(char key, + const IValue& ival, + std::stringstream& ss, + int precision = 6) { + // TODO: Implement precison-based formatting + std::stringstream tmp; + switch (key) { + case 'd': + case 'i': + if (ival.isInt()) { + ss << ival.toInt(); + } else { + ss << static_cast(ival.toDouble()); + } + break; + case 'e': + case 'E': + tmp << std::setprecision(precision) << std::scientific; + if (key == 'E') { + tmp << std::uppercase; + } + if (ival.isInt()) { + tmp << static_cast(ival.toInt()); + } else { + tmp << static_cast(ival.toDouble()); + } + ss << tmp.str(); + break; + case 'f': + case 'F': + tmp << std::setprecision(precision) << std::fixed; + if (ival.isInt()) { + tmp << static_cast(ival.toInt()); + } else { + tmp << static_cast(ival.toDouble()); + } + ss << tmp.str(); + break; + case 'c': + if (ival.isInt()) { + ss << static_cast(ival.toInt()); + } else { + ss << ival.toStringRef(); + } + break; + case 's': + if (ival.isString()) { + ss << ival.toStringRef(); + } else { + ss << ival; + } + break; + default: + TORCH_CHECK(false, "The specifier %", key, " is not supported in TorchScript format strings"); + } + } + + std::string interprete_percent_format(std::vector& stack, size_t num_inputs) { + auto format_str = peek(stack, 0, num_inputs).toStringRef(); + auto args = last(stack, num_inputs - 1)[0]; + size_t args_size = 1; // assumed size + if (args.isTuple()) { + args_size = args.toTuple()->elements().size(); + } + std::stringstream ss; + size_t used_args = 0; + size_t begin = 0; + + while (true) { + size_t percent_idx = format_str.find('%', begin); + if (percent_idx == std::string::npos) { + ss << format_str.substr(begin); + break; + } + size_t format_idx = percent_idx + 1; + TORCH_CHECK( + percent_idx < format_str.length() - 1, "Incomplete format specifier"); + ss << format_str.substr(begin, percent_idx - begin); + + if (format_str.at(format_idx) == '%') { + ss << '%'; + begin = percent_idx + 2; // skip the `%` and the format specifier + continue; + } + + // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + TORCH_CHECK(used_args < args_size, "Too few arguments for format string"); + char key = format_str.at(format_idx); + IValue arg; + if (args.isTuple()) { + arg = args.toTuple()->elements()[used_args]; + } else { + arg = args; + } + add_formatted_arg(key, arg, ss); + begin = percent_idx + 2; + ++used_args; + } + // 
NOLINTNEXTLINE(clang-diagnostic-sign-compare) + TORCH_CHECK(used_args == args_size, "Too many arguments for format string"); + std::string result = ss.str(); + return result; + } + + /** + * the schema of percentformat is : "aten::percentFormat(str self, ...) -> str" + * **/ + bool try_to_freeze_percentformat(Node* format_node) { + //Graph* graph = format_node->owningGraph(); + at::ArrayRef inputs = format_node->inputs(); + size_t num_inputs = inputs.size(); + + //no format input situation. + if (num_inputs < 2) { + LOG(INFO) << "should not freeze node: " << node_info(format_node); + return false; + } + + //bool all_input_constant = true; + std::vector stack; + for(size_t index = 0; index < num_inputs; index++) { + if (inputs[index]->node()->kind() != prim::Constant) { + LOG(INFO) << "should not freeze node: " << node_info(format_node); + return false; + } else { + c10::optional ivalue = toIValue(inputs[index]->node()->output()); + if (ivalue.has_value()) { + stack.push_back(ivalue.value()); + } + } + } + + if (stack.size() != num_inputs) { + LOG(INFO) << "should not freeze node: " << node_info(format_node); + return false; + } + + //if we reach here, that means all inputs are constant. let's calculate the result + std::string result = interprete_percent_format(stack, num_inputs); + LOG(INFO) << "try to replace the output of node :" << node_info(format_node) + << " with constant value: " << result; + WithInsertPoint guard(graph_->block()->nodes().front()); + Value* string_const = graph_->insertConstant(result); + format_node->outputs().at(0)->replaceAllUsesWith(string_const); + format_node->destroy(); + return true; + } + + bool freeze_percentformats(Block* block) { + bool changed = false; + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + // we might destroy the current node, so we need to pre-increment + // the iterator + Node* node = *it; + ++it; + for (Block* subblock : node->blocks()) { + changed |= freeze_percentformats(subblock); + } + if (is_percent_format_node(node)) { + LOG(INFO) << "meet percent format node :" << node_info(node); + changed |= try_to_freeze_percentformat(node); + } + } + return changed; + } + +std::shared_ptr graph_; +}; + +} // namespace + +void freeze_percentformat(std::shared_ptr graph) { + LOG(INFO) << "Running poros freeze_percentformat passes"; + FreezePercentFormat fpf(std::move(graph)); + fpf.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu diff --git a/poros/src/poros/lowering/unpack_certain_ops.cpp b/poros/src/poros/lowering/unpack_certain_ops.cpp new file mode 100644 index 0000000000..7c0c82da4f --- /dev/null +++ b/poros/src/poros/lowering/unpack_certain_ops.cpp @@ -0,0 +1,127 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Part of the following code in this file refs to +// https://github.com/pytorch/TensorRT/blob/master/core/lowering/passes/unpack_var.cpp +// +// Copyright (c) 2020-present, NVIDIA CORPORATION. 
All rights reserved. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// Licensed under the BSD 3-Clause "New" or "Revised" License + +/** +* @file unpack_certain_ops.cpp +* @author tianjinjin@baidu.com +* @date Thu Sep 23 20:15:53 CST 2021 +* @brief +**/ + +#include "poros/lowering/lowering_pass.h" + +#include "torch/csrc/jit/passes/subgraph_rewrite.h" + +namespace baidu { +namespace mirana { +namespace poros { + +void unpack_std(std::shared_ptr& graph) { + std::string std_pattern = R"IR( + graph(%1, %dim, %unbiased, %keepdim): + %out: Tensor = aten::std(%1, %dim, %unbiased, %keepdim) + return (%out))IR"; + + std::string unpacked_pattern = R"IR( + graph(%1, %dim, %unbiased, %keepdim): + %z: Tensor = aten::var(%1, %dim, %unbiased, %keepdim) + %out: Tensor = aten::sqrt(%z) + return (%out))IR"; + + torch::jit::SubgraphRewriter std_rewriter; + std_rewriter.RegisterRewritePattern(std_pattern, unpacked_pattern); + std_rewriter.runOnGraph(graph); +} + +void unpack_var(std::shared_ptr& graph) { + std::string var_pattern = R"IR( + graph(%input, %dim, %unbiased, %keepdim): + %out: Tensor = aten::var(%input, %dim, %unbiased, %keepdim) + return (%out))IR"; + std::string unpacked_pattern = R"IR( + graph(%input, %dims, %unbiased, %keepdim): + %none: None = prim::Constant() + %false: bool = prim::Constant[value=0]() + %0: int = prim::Constant[value=0]() + %f32_dtype: int = prim::Constant[value=6]() + %1: int = prim::Constant[value=1]() + %sqrd: Tensor = aten::mul(%input, %input) + %sqrdmean: Tensor = aten::mean(%sqrd, %dims, %keepdim, %none) + %mean: Tensor = aten::mean(%input, %dims, %keepdim, %none) + %meansqrd: Tensor = aten::mul(%mean, %mean) + %var: Tensor = aten::sub(%sqrdmean, %meansqrd, %1) + %varout : Tensor = prim::If(%unbiased) + block0(): + %shape: int[] = aten::size(%input) + %shapet: Tensor = aten::tensor(%shape, %f32_dtype, %none, %false) + %dim: int = prim::ListUnpack(%dims) + %reduceddims: Tensor = aten::select(%shapet, %0, %dim) + %numel: Tensor = aten::prod(%reduceddims, %dim, %keepdim, %none) + %mul: Tensor = aten::mul(%var, %numel) + %sub: Tensor = aten::sub(%numel, %1, %1) + %v: Tensor = aten::div(%mul, %sub) + -> (%v) + block1(): + -> (%var) + return(%varout))IR"; + + torch::jit::SubgraphRewriter var_rewriter; + var_rewriter.RegisterRewritePattern(var_pattern, unpacked_pattern); + var_rewriter.runOnGraph(graph); +} + +void replace_log_softmax(std::shared_ptr graph) { + std::string old_pattern = R"IR( + graph(%1, %dim, %dtype): + %out: Tensor = aten::log_softmax(%1, %dim, %dtype) + return (%out))IR"; + + std::string new_pattern = R"IR( + graph(%1, %dim, %dtype): + %2: Tensor = aten::softmax(%1, %dim, %dtype) + %out: Tensor = aten::log(%2) + return (%out))IR"; + + torch::jit::SubgraphRewriter std_rewriter; + std_rewriter.RegisterRewritePattern(old_pattern, new_pattern); + std_rewriter.runOnGraph(graph); +} + +void replace_log_sigmoid(std::shared_ptr graph) { + std::string old_pattern = R"IR( + graph(%1): + %out: Tensor = aten::log_sigmoid(%1) + return (%out))IR"; + + std::string new_pattern = R"IR( + graph(%1): + %2: Tensor = aten::sigmoid(%1) + %out: Tensor = aten::log(%2) + return (%out))IR"; + + torch::jit::SubgraphRewriter std_rewriter; + std_rewriter.RegisterRewritePattern(old_pattern, new_pattern); + std_rewriter.runOnGraph(graph); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/lowering/unrolling_loop.cpp b/poros/src/poros/lowering/unrolling_loop.cpp new file mode 100644 index 
0000000000..29cf66d29d --- /dev/null +++ b/poros/src/poros/lowering/unrolling_loop.cpp @@ -0,0 +1,386 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Part of the following code in this file refs to +// https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/passes/loop_unrolling.cpp +// +// Copyright (c) Meta Platforms, Inc. and affiliates. +// Licensed under the 3-Clause BSD License + +/** +* @file unrolling_loop.cpp +* @author tianjinjin@baidu.com +* @date Mon Nov 22 16:59:25 CST 2021 +* @brief this file is modified from torch/csrc/jit/passes/loop_unrolling.cpp +* and some parameters are different from the original funciton +**/ +#include "poros/lowering/lowering_pass.h" + +#include +#include +#include +#include +#include + +#include "poros/util/poros_util.h" + +namespace baidu { +namespace mirana { +namespace poros { + +namespace { +using namespace torch::jit; + +static constexpr int64_t UnrollFactor = 8; +static constexpr int64_t MaxBodySize = 256; +static constexpr int64_t MaxBodyRepeats = 64; +static constexpr int64_t MaxLoopMulResult = 32 * 64; + +struct UnrollingLoop { + UnrollingLoop(std::shared_ptr graph) : graph_(std::move(graph)) {} + + void run() { + bool changed = unroll_loops(graph_->block(), true); + GRAPH_DUMP("afte unroll_loop graph:", graph_); + changed |= eliminate_useless_loop_count_body(graph_->block()); + GRAPH_DUMP("afte eliminate_useless_loop_count_body graph:", graph_); + if (changed) { + ConstantPropagation(graph_); + EliminateDeadCode(graph_); + EliminateCommonSubexpression(graph_); + ConstantPooling(graph_); + } + return; + } + +private: + + bool is_for_loop(Node* node) { + if (node->kind() != prim::Loop) { + return false; + } + Value* start_cond = node->inputs().at(1); + c10::optional maybe_start_value = constant_as(start_cond); + Value* continue_cond = node->blocks().at(0)->outputs().at(0); + c10::optional maybe_continue_value = constant_as(continue_cond); + return maybe_start_value && *maybe_start_value && maybe_continue_value && *maybe_continue_value; + } + + int64_t limited_block_size(Block* body, int64_t limit) { + auto it = body->nodes().begin(); + auto end = body->nodes().end(); + for (int64_t i = 0; i < limit; ++it) { + for (Block* subblock : it->blocks()) { + i += limited_block_size(subblock, limit - i); + } + if (!it->notExecutedOp()) { + ++i; + } + if (it == end) { + return i; + } + } + return limit; + } + + int64_t calculate_block_size(Block* body) { + auto it = body->nodes().begin(); + int64_t count = 0; + while (it != body->nodes().end()) { + auto node = *it; + ++it; //先++it + for (auto block : node->blocks()) { + count += calculate_block_size(block); + } + if (!node->notExecutedOp()) { + ++count; + } + } + return count; + } + + bool is_small_block(Block* body) { + return limited_block_size(body, MaxBodySize + 1) <= MaxBodySize; + } + + // XXX: This function can only be called with a loop that is guaranteed to + // execute EXACTLY ONCE. 
+ void inline_body(Node* loop) { + auto graph = loop->owningGraph(); + auto body = loop->blocks().at(0); + WithInsertPoint insert_point_guard{loop}; + + std::unordered_map value_map; + auto get_value = [&](Value* v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + + // Loop node has extra (max_iters, initial_cond) inputs, + // body has an extra (loop_counter) input. + for (size_t i = 2; i < loop->inputs().size(); ++i) { + value_map[body->inputs()[i - 1]] = loop->inputs()[i]; + } + + for (Node* orig : body->nodes()) { + Node* clone = graph->insertNode(graph->createClone(orig, get_value)); + for (size_t i = 0; i < orig->outputs().size(); ++i) { + value_map[orig->outputs()[i]] = clone->outputs()[i]; + } + } + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs().at(i)->replaceAllUsesWith( + get_value(body->outputs().at(i + 1))); + } + // XXX: it is extremely important to destroy the loop in here. DCE might not + // be able to conclude that it's safe, because the loop might contain side + // effects. + loop->destroy(); + } + + // inserts a copy of body, passing inputs to the inputs of the block + // it returns the a list of the Values for the output of the block + std::vector insert_block_copy(Graph& graph, + Block* body, + at::ArrayRef inputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == body->inputs().size()); + std::unordered_map value_map; + auto get_value = [&](Value* v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + auto inputs_it = inputs.begin(); + for (Value* input : body->inputs()) { + value_map[input] = *inputs_it++; + } + for (Node* node : body->nodes()) { + Node* new_node = graph.insertNode(graph.createClone(node, get_value)); + auto outputs_it = new_node->outputs().begin(); + for (Value* output : node->outputs()) { + value_map[output] = *outputs_it++; + } + } + return fmap(body->outputs(), get_value); //maybe not recognized + } + + void repeat_body(Block* body, size_t times, Block* dest) { + auto graph = body->owningGraph(); + WithInsertPoint insert_point_guard(dest); + for (Value* input : body->inputs()) { + dest->addInput()->copyMetadata(input); + } + + std::vector io = dest->inputs().vec(); + TORCH_INTERNAL_ASSERT( + !body->inputs().at(0)->hasUses(), "loop counter should be unused"); + for (size_t i = 0; i < times; ++i) { + io[0] = body->inputs().at(0); + io = insert_block_copy(*graph, body, io); + } + for (Value* output : io) { + dest->registerOutput(output); + } + + // It's likely that we have some dead nodes now - for example the "true" + // constant that prevents the loop from breaking. We shouldn't wait too long + // before removing them because they might artificially increase the loop size + // and prevent outer loop unrolling. + torch::jit::EliminateDeadCode(dest, false); + } + + // Replaces the builtin loop counter with a "mutable" variable outside of the + // loop. 
+ void replace_loop_counter(Node* loop) { + Graph* graph = loop->owningGraph(); + Block* body = loop->blocks().at(0); + WithInsertPoint guard(loop); + Value* init_counter = graph->insertConstant(0); + + loop->insertInput(2, init_counter); + loop->insertOutput(0)->setType(IntType::get()); + + Value* internal_counter = body->insertInput(1)->setType(init_counter->type()); + body->inputs()[0]->replaceAllUsesWith(internal_counter); + + WithInsertPoint insertPointGuard{body->return_node()}; + Value* result = graph->insert(aten::add, {internal_counter, 1}); + body->insertOutput(1, result); + } + + bool unroll(Node* loop) { + Graph* graph = loop->owningGraph(); + Block* body = loop->blocks().at(0); + + int64_t block_size = calculate_block_size(body); + if (block_size > MaxBodySize) { + return false; + } + + // if (!is_small_block(body)) { + // return false; + // } + + // We will be using a "mutable" counter outside of the loop instead of the + // default one, because this will allow us to share it between the unrolled + // loop and its epilogue. This is necessary only if the loop counter is + // actually used in the body. + if (body->inputs()[0]->uses().size() > 0) + replace_loop_counter(loop); + + // Some optimization for constant-length loops. If we know they won't run too + // many times, then we can unroll them entirely. + Value* trip_count = loop->inputs().at(0); + c10::optional const_len = constant_as(trip_count); + //auto loop_mul_result = block_size * const_len; + if (const_len && *const_len < MaxBodyRepeats && (block_size * (*const_len)) < MaxLoopMulResult) { + Block* dest = loop->addBlock(); + repeat_body(body, *const_len, dest); + loop->eraseBlock(0); + inline_body(loop); + return true; + } + + WithInsertPoint insert_point_guard{loop}; + + // Clone the loop before we unroll it. The clone will become the epilogue. + Node* loop_epilogue = + graph->createClone(loop, [](Value* v) { return v; })->insertAfter(loop); + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs()[i]->replaceAllUsesWith(loop_epilogue->outputs()[i]); + loop_epilogue->replaceInput(i + 2, loop->outputs()[i]); + } + + Block* dest = loop->addBlock(); + repeat_body(body, UnrollFactor, dest); + loop->eraseBlock(0); + + // Change the iteration counts of both loops + Value* iter_count = loop->inputs().at(0); + Value* unrolled_iter_count = graph->insert( + aten::__round_to_zero_floordiv, {iter_count, UnrollFactor}); + loop->replaceInput(0, unrolled_iter_count); + loop_epilogue->replaceInput( + 0, + graph->insert( + aten::sub, + {iter_count, + graph->insert(aten::mul, {unrolled_iter_count, UnrollFactor})})); + return true; + } + + bool unroll_loops(Block* block, bool constant_only) { + bool changed = false; + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + // XXX: unroll might destroy the current node, so we need to pre-increment + // the iterator + Node* node = *it; + ++it; + for (Block* subblock : node->blocks()) { + changed |= unroll_loops(subblock, constant_only); + } + if (!is_for_loop(node)) { + continue; + } + //only handle max loop is constant situation. 
+ if (constant_only && node->inputs().at(0)->node()->kind() != prim::Constant) { + continue; + } + changed |= unroll(node); + } + return changed; + } + + // 去掉像以下形式的prim::loop计数block + // %943 : int = prim::Loop(%65, %4641, %idx.5) + // block0(%944 : int, %945 : int): + // %0 : int = prim::Constant[value=1]() + // %7285 : int = aten::add(%945, %0) + // %947 : bool = aten::lt(%7285, %idx.13) + // %948 : bool = aten::__and__(%947, %4641) + // -> (%948, %7285) + // 其中原来的节点已经展开到parent graph中了,只剩下计数模块,没有实质作用,可以删掉。 + bool eliminate_useless_loop_count_body(Block* block) { + bool changed = false; + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + // XXX: unroll might destroy the current node, so we need to pre-increment + // the iterator + Node* node = *it; + ++it; + for (Block* subblock : node->blocks()) { + changed |= eliminate_useless_loop_count_body(subblock); + } + // pattern + if (!is_uesless_loop_count_body(node)) { + continue; + } + changed |= destory_useles_loop_count_body(node); + } + return changed; + } + // 判断是否是无用的prim::loop计数模块 + bool is_uesless_loop_count_body(Node* node) { + // 输入node类型必须是prim::loop + if (node->kind() != prim::Loop) { + return false; + } + // prim::loop的输出必须无user + if (node->hasUses()) { + return false; + } + auto loop_block = node->blocks().at(0); + std::vector loop_block_nodes; + for (auto it = loop_block->nodes().begin(); it != loop_block->nodes().end(); it++) { + loop_block_nodes.push_back(*it); + if (loop_block_nodes.size() > 3) { + return false; + } + } + // block必须只有3个nodes且顺序必须是1-->aten::add、2-->aten::lt、3-->aten::__and__ + if (loop_block_nodes.size() == 3 && + loop_block_nodes[0]->kind() == aten::add && + loop_block_nodes[1]->kind() == aten::lt && + loop_block_nodes[2]->kind() == aten::__and__) { + LOG(INFO) << "Find useless loop counter body on node: [ " << node_info(node) << " ]"; + return true; + } + return false; + } + // 删掉无用的prim::loop计数节点 + bool destory_useles_loop_count_body(Node* node) { + if (node->kind() != prim::Loop) { + return false; + } + LOG(INFO) << "Destory useless loop counter node: [ " << node_info(node) << " ]"; + node->destroy(); + return true; + } + + std::shared_ptr graph_; +}; + +} // namespace + +void unrolling_loop(std::shared_ptr graph) { + LOG(INFO) << "Running poros unrolling_loop passes"; + UnrollingLoop ul(std::move(graph)); + ul.run(); +} + +} // namespace poros +} // namespace mirana +} // namespace baidu \ No newline at end of file diff --git a/poros/src/poros/util/graph_test_helper.cpp b/poros/src/poros/util/graph_test_helper.cpp new file mode 100644 index 0000000000..bf01b69d3c --- /dev/null +++ b/poros/src/poros/util/graph_test_helper.cpp @@ -0,0 +1,167 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
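A hedged sketch of what the unroller does to a small constant-trip-count loop; the thresholds come from the constants above (`MaxBodySize`, `MaxBodyRepeats`, `MaxLoopMulResult`), and the wrapper is illustrative:

```cpp
#include "poros/lowering/lowering_pass.h"

// before:  %y : Tensor = prim::Loop(%4, %true, %x)
//            block0(%i : int, %acc : Tensor):
//              %next : Tensor = aten::relu(%acc)
//              -> (%true, %next)
// after:   the body is copied 4 times and inlined, so the loop disappears:
//          %r1 = aten::relu(%x); %r2 = aten::relu(%r1); ...; %y = %r4
// Loops with non-constant trip counts are instead unrolled by UnrollFactor,
// with a cloned epilogue loop handling the remainder iterations.
void demo_unrolling_loop(std::shared_ptr<torch::jit::Graph> graph) {
    baidu::mirana::poros::unrolling_loop(graph);
}
```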
+ +/** +* @file test_util.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include "graph_test_helper.h" + +#include +#include + +namespace baidu { +namespace mirana { +namespace poros { +namespace graphtester { + +static inline void clone_tensor_vector(const std::vector &old_vec, std::vector &new_vec) { + for (auto i: old_vec) { + new_vec.push_back(i.clone()); + } +} + + +static bool write_to_log(const std::string &log_path, const std::string &inform) { + if (log_path.empty()) { + return true; + } else { + std::ofstream log_file(log_path, std::ios::app); + if (log_file) { + log_file << inform; + log_file.close(); + return true; + } else { + return false; + } + } +}; + + +static std::vector run_graph(const std::shared_ptr &graph, + const std::vector &input_data, + const baidu::mirana::poros::PorosOptions &poros_option, + const std::string &log_path) { + std::vector graph_output; + // 构建graph exe + std::string function_name("test tensor"); + torch::jit::GraphExecutor graph_exe(graph, function_name); + // 将输入导入ivalue vector + std::vector graph_input; + for (size_t i = 0; i < input_data.size(); i++) { + graph_input.push_back(input_data[i]); + } + // 执行graph + std::clock_t start, end; + start = std::clock(); + graph_exe.run(graph_input); + end = std::clock(); + std::string log_inform = "graph time:" + std::to_string(double(end - start) / CLOCKS_PER_SEC * 1000.0) + "ms\t"; + std::cout << log_inform; + if (!write_to_log(log_path, log_inform)) { + LOG(WARNING) << "write to log failed"; + } + // 提取结果 + for (size_t i = 0; i < graph_input.size(); i++) { + auto tmp_ivalue = graph_input[i]; + graph_output.push_back(tmp_ivalue.toTensor()); + } + return graph_output; +}; + + +std::vector +replace_input_tensor_to_constant(std::shared_ptr graph, const std::vector &input_data, + const std::vector &input_data_type_mask) { + torch::jit::WithInsertPoint guard(graph->block()->nodes().front()); + std::vector graph_input_tensor; + std::vector eraseInputIdx; + for (size_t i = 0; i < input_data_type_mask.size() && i < graph->inputs().size() && i < input_data.size(); i++) { + switch (input_data_type_mask[i]) { + case InputTensor: //正常输入Tensor + graph_input_tensor.push_back(input_data[i].toTensor()); + break; + case ConstantTensor: //op的weights和bias + graph->inputs()[i]->replaceAllUsesWith(graph->insertConstant(input_data[i])); + eraseInputIdx.push_back(i); + break; + case ConstantIntVector: // int[] = prim::Constant[value=[1, 1, 1]]() + graph->inputs()[i]->replaceAllUsesWith(graph->insertConstant(input_data[i].toIntList())); + eraseInputIdx.push_back(i); + break; + } + } + // 从后向前删除多余的input + for (auto it = eraseInputIdx.rbegin(); it != eraseInputIdx.rend(); it++) { + graph->eraseInput(*it); + } + + return graph_input_tensor; +} + +bool run_graph_and_fused_graph(const std::string &graph_IR, + const baidu::mirana::poros::PorosOptions &poros_option, + std::shared_ptr fuser, + const std::vector &input_data, + const std::vector &input_data_type_mask, + std::vector &original_graph_output, + std::vector &fused_graph_output, + std::string log_path) { + try { + fuser->reset(); + // 解析graph + auto graph = std::make_shared(); + torch::jit::parseIR(graph_IR, graph.get()); + auto input_tensor = replace_input_tensor_to_constant(graph, input_data, input_data_type_mask); + // 冷启动运行原始graph + std::vector input_of_replaced_graph; + clone_tensor_vector(input_tensor, input_of_replaced_graph); + std::cout << "input replaced "; + original_graph_output = run_graph(graph, 
+
+bool run_graph_and_fused_graph(const std::string &graph_IR,
+                               const baidu::mirana::poros::PorosOptions &poros_option,
+                               std::shared_ptr<IFuser> fuser,
+                               const std::vector<c10::IValue> &input_data,
+                               const std::vector<InputTypeEnum> &input_data_type_mask,
+                               std::vector<at::Tensor> &original_graph_output,
+                               std::vector<at::Tensor> &fused_graph_output,
+                               std::string log_path) {
+    try {
+        fuser->reset();
+        // parse the graph IR
+        auto graph = std::make_shared<torch::jit::Graph>();
+        torch::jit::parseIR(graph_IR, graph.get());
+        auto input_tensor = replace_input_tensor_to_constant(graph, input_data, input_data_type_mask);
+        // cold-start run of the graph (inputs already replaced by constants)
+        std::vector<at::Tensor> input_of_replaced_graph;
+        clone_tensor_vector(input_tensor, input_of_replaced_graph);
+        std::cout << "input replaced ";
+        original_graph_output = run_graph(graph, input_of_replaced_graph, poros_option, log_path);
+
+        // run the constant-folded graph
+        std::vector<at::Tensor> input_of_ori_graph;
+        clone_tensor_vector(input_tensor, input_of_ori_graph);
+        std::cout << std::endl << "original ";
+        original_graph_output = run_graph(graph, input_of_ori_graph, poros_option, log_path);
+
+        // run the graph after op fusion
+        std::vector<at::Tensor> input_of_fused_graph;
+        clone_tensor_vector(input_tensor, input_of_fused_graph);
+        fuser->fuse(graph);
+        std::cout << std::endl << "op fused ";
+        fused_graph_output = run_graph(graph, input_of_fused_graph, poros_option, log_path);
+        std::cout << std::endl << fuser->info() << std::endl << std::endl;
+    } catch (...) {
+        return false;
+    }
+    return true;
+}
+
+bool almost_equal(const at::Tensor &a, const at::Tensor &b, const float &threshold) {
+    auto a_float = a.toType(at::kFloat);
+    auto b_float = b.toType(at::kFloat);
+    double maxValue = 0.0;
+    maxValue = fmax(a.abs().max().item<float>(), maxValue);
+    maxValue = fmax(b.abs().max().item<float>(), maxValue);
+    at::Tensor diff = a_float - b_float;
+    return diff.abs().max().item<float>() <= threshold * maxValue;
+}
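+
+// Note: the check above is relative, not absolute: it tests
+//   max|a - b| <= threshold * max(max|a|, max|b|),
+// so `threshold` should be read as a relative error bound scaled by the
+// largest magnitude present in either tensor.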
+
+}// namespace graphtester
+}// namespace poros
+}// namespace mirana
+}// namespace baidu
diff --git a/poros/src/poros/util/graph_test_helper.h b/poros/src/poros/util/graph_test_helper.h
new file mode 100644
index 0000000000..625e08d9e7
--- /dev/null
+++ b/poros/src/poros/util/graph_test_helper.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file graph_test_helper.h
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#pragma once
+
+#include "poros/compile/poros_module.h"
+#include "poros/lowering/op_fuse_pass.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+namespace graphtester {
+
+enum InputTypeEnum {
+    InputTensor = 0,    // regular input of the op
+    ConstantTensor,     // weights and bias of the op
+    ConstantIntVector,  // int[] parameters such as conv2d's stride
+};
+
+/**
+ * @brief run the original graph and the fused graph, then compare their outputs
+ *
+ * @param graph_IR
+ * @param poros_option : default device is GPU
+ * @param fuser : op fuser to test
+ * @param input_data : vector of at::IValue, compatible with Tensor, vector, scalar, etc.
+ * @param input_data_type_mask : tells the function how to handle each input, i.e. which
+ *        entries are converted from Tensor, vector or int[] to prim::Constant
+ * @param original_graph_output : vector of at::Tensor, graph outputs
+ * @param fused_graph_output : vector of at::Tensor, poros outputs
+ * @param log_path
+ * @return bool
+ */
+bool run_graph_and_fused_graph(const std::string &graph_IR,
+                               const baidu::mirana::poros::PorosOptions &poros_option,
+                               std::shared_ptr<IFuser> fuser,
+                               const std::vector<c10::IValue> &input_data,
+                               const std::vector<InputTypeEnum> &input_data_type_mask,
+                               std::vector<at::Tensor> &original_graph_output,
+                               std::vector<at::Tensor> &fused_graph_output,
+                               std::string log_path = "");
+
+/**
+ * @brief compare the similarity of two Tensors containing Float
+ *
+ * @param [in] a : first Tensor
+ * @param [in] b : second Tensor
+ * @param [in] threshold : acceptable relative threshold
+ * @return bool
+ * @retval true => succeed  false => failed
+**/
+bool almost_equal(const at::Tensor &a, const at::Tensor &b, const float &threshold);
+
+}// namespace graphtester
+}// namespace poros
+}// namespace mirana
+}// namespace baidu
diff --git a/poros/src/poros/util/macros.h b/poros/src/poros/util/macros.h
new file mode 100644
index 0000000000..a6888bfbbc
--- /dev/null
+++ b/poros/src/poros/util/macros.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file macros.h
+* @author tianjinjin@baidu.com
+* @date Fri Jun 4 16:16:38 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <c10/util/Exception.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+#define POROS_TIME_COST_US(pre, now) ((now.tv_sec - pre.tv_sec) * 1000000 + (now.tv_usec - pre.tv_usec))
+#define POROS_TIME_COST_MS(pre, now) ((now.tv_sec - pre.tv_sec) * 1000 + (now.tv_usec - pre.tv_usec) / 1000)
+
+// ----------------------------------------------------------------------------
+// Error reporting macros
+// ----------------------------------------------------------------------------
+#define POROS_CHECK_RET_EXIT(n, s) { \
+    if ((n) != 0) {                  \
+        LOG(FATAL) << s;             \
+        exit(1);                     \
+    }                                \
+}
+
+#define POROS_CHECK_RET(n, s) { \
+    if ((n) != 0) {             \
+        LOG(WARNING) << s;      \
+        return -1;              \
+    }                           \
+}
+
+#define POROS_CHECK_TRUE(n, s) { \
+    if ((n) != true) {           \
+        LOG(WARNING) << s;       \
+        return false;            \
+    }                            \
+}
+
+#define POROS_THROW_ERROR(msg) \
+    throw ::c10::Error({__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, #msg);
+
+#define POROS_ASSERT(cond, ...)                                            \
+    if (!(cond)) {                                                         \
+        POROS_THROW_ERROR(                                                 \
+            #cond << " ASSERT FAILED at " << __FILE__ << ':' << __LINE__   \
+                  << ", consider filing a bug to cudp@baidu.com \n"        \
+                  << __VA_ARGS__);                                         \
+    }
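+
+// Usage sketch (illustrative only; `check_input` is a hypothetical caller,
+// not part of poros):
+//
+//   bool check_input(const at::Tensor& t) {
+//       POROS_CHECK_TRUE(t.defined(), "input tensor is undefined");
+//       POROS_ASSERT(t.dim() > 0, "expected a non-scalar tensor");
+//       return true;
+//   }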
+
+#define POROS_CHECK(cond, ...)                                                                    \
+    if (!(cond)) {                                                                                \
+        POROS_THROW_ERROR("Expected " << #cond << " to be true but got false\n" << __VA_ARGS__);  \
+    }
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
\ No newline at end of file
diff --git a/poros/src/poros/util/poros_util.cpp b/poros/src/poros/util/poros_util.cpp
new file mode 100644
index 0000000000..97e7a6dd64
--- /dev/null
+++ b/poros/src/poros/util/poros_util.cpp
@@ -0,0 +1,339 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_util.cpp
+* @author tianjinjin@baidu.com
+* @date Wed Apr 7 17:52:36 CST 2021
+* @brief
+**/
+
+#include "poros/util/poros_util.h"
+#include "poros/log/poros_logging.h"
+#include "poros/util/macros.h"
+#include "poros/context/poros_global.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+int merge_graph_to_module(std::shared_ptr<torch::jit::Graph>& to_merge_graph,
+                          torch::jit::Module& module,
+                          bool init_module_ptr) {
+    if (init_module_ptr) {
+        // pass the module itself as the first input of the graph -- do not forget this!
+        auto self = to_merge_graph->insertInput(0, "self");
+        self->setType(module._ivalue()->type());
+    }
+
+    auto new_method = module._ivalue()->compilation_unit()->create_function("forward", to_merge_graph);
+    std::vector<c10::Argument> args;
+    int index = 0;
+    for (auto in : to_merge_graph->inputs()) {
+        args.push_back(c10::Argument("input" + std::to_string(index), in->type()));
+        index++;
+    }
+
+    index = 0;
+    std::vector<c10::Argument> res;
+    for (auto out : to_merge_graph->outputs()) {
+        res.push_back(c10::Argument("output" + std::to_string(index), out->type()));
+        index++;
+    }
+    auto schema = c10::FunctionSchema(new_method->name(), new_method->name(), args, res);
+    module.type()->addMethod(new_method);
+    new_method->setSchema(schema);
+    return 0;
+}
+
+torch::jit::Module build_tmp_module(std::shared_ptr<torch::jit::Graph>& sub_graph) {
+    torch::jit::script::Module new_mod("tmp_submodule");
+    auto graph = sub_graph->copy();
+    merge_graph_to_module(graph, new_mod, true);
+    return new_mod;
+}
+
+// see how operator<< is implemented in aten/src/ATen/core/type.cpp for reference
+bool gen_dims_for_tensor(const torch::jit::Value* value, std::vector<int64_t>& dims) {
+    POROS_CHECK_TRUE((value->type()->isSubtypeOf(c10::TensorType::get())),
+        "given value for gen_dims_for_tensor is not Tensor as expected");
+    std::vector<int64_t>().swap(dims);
+    c10::TensorTypePtr op = value->type()->cast<c10::TensorType>();
+    if (auto ndim = op->sizes().size()) {
+        for (size_t i = 0; i < *ndim; ++i) {
+            if (auto s = op->sizes()[i]) {
+                dims.push_back(s.value());
+            } else {
+                dims.push_back(-1);
+            }
+        }
+        return true;
+    }
+    return false;
+}
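+
+// Example (illustrative): for a value typed as Float(8, *, 16) this yields
+// dims = {8, -1, 16}; dimensions whose size is unknown are recorded as -1.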
+
+void update_global_context(torch::jit::Value* old_value, torch::jit::Value* new_value) {
+    // copy value_dynamic_shape_map
+    if (old_value->type()->isSubtypeOf(c10::TensorType::get())) {
+        if (PorosGlobalContext::instance()._value_dynamic_shape_map.count(old_value) > 0) {
+            PorosGlobalContext::instance()._value_dynamic_shape_map[new_value] =
+                PorosGlobalContext::instance()._value_dynamic_shape_map[old_value];
+        }
+    } else if (old_value->type()->kind() == c10::TypeKind::IntType) {
+        update_global_int_intlist_map_context(old_value, new_value);
+    } else if (old_value->type()->kind() == c10::TypeKind::ListType) {
+        if (old_value->type()->isSubtypeOf(c10::ListType::ofInts())) {
+            update_global_int_intlist_map_context(old_value, new_value);
+        }
+        PorosGlobalContext::instance()._list_size_map.update_value(old_value, new_value);
+    } else {
+
+    }
+    //to add more @wangrui39
+}
+
+void update_global_int_intlist_map_context(torch::jit::Value* old_value, torch::jit::Value* new_value) {
+    if (PorosGlobalContext::instance()._int_intlist_values_map.count(old_value) > 0) {
+        PorosGlobalContext::instance()._int_intlist_values_map[new_value] =
+            PorosGlobalContext::instance()._int_intlist_values_map[old_value];
+    }
+}
+
+void update_global_list_size_map_node_key_context(torch::jit::Node* old_node, torch::jit::Node* new_node) {
+    PorosGlobalContext::instance()._list_size_map.update_node(old_node, new_node);
+}
+
+void unmerge_subgraph(torch::jit::Node* subgraph_node) {
+    // Inline the graph, replace uses of node outputs and destroy the node
+    auto outer_graph = subgraph_node->owningGraph();
+    std::shared_ptr<torch::jit::Graph> sub_graph = subgraph_node->g(torch::jit::attr::Subgraph);
+
+    torch::jit::WithInsertPoint guard(subgraph_node);
+    const auto subgraph_outputs = torch::jit::insertGraph(
+        *outer_graph, *sub_graph, subgraph_node->inputs());
+    AT_ASSERT(subgraph_outputs.size() >= subgraph_node->outputs().size());
+    for (size_t i = 0; i < subgraph_node->outputs().size(); ++i) {
+        subgraph_node->outputs()[i]->replaceAllUsesWith(subgraph_outputs[i]);
+        update_global_context(subgraph_node->outputs()[i], subgraph_outputs[i]);
+    }
+    subgraph_node->destroy();
+}
+
+void find_to_optimized_nodes(torch::jit::Block* block, std::vector<torch::jit::Node*>& to_optimized_nodes) {
+    //bool changed = false;
+    for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) {
+        torch::jit::Node* node = *it;
+        for (torch::jit::Block* subblock : node->blocks()) {
+            find_to_optimized_nodes(subblock, to_optimized_nodes);
+        }
+        if (node->kind() == torch::jit::prim::CudaFusionGroup) {
+            to_optimized_nodes.push_back(node);
+        }
+    }
+}
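+
+// Typical flow (sketch): callers collect all prim::CudaFusionGroup nodes
+// up-front with find_to_optimized_nodes, then try to transform each one into
+// an engine, calling unmerge_subgraph to fall back when a transform fails.
+// Collecting first avoids invalidating the block iterator by destroying
+// nodes while walking the block.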
+
+/********************************************************************
+  SOME DEPRECATED FUNCTIONS BELOW
+*********************************************************************/
+//DEPRECATED
+bool gen_dims_for_scarlar(const torch::jit::Value* value, std::vector<int64_t>& dims) {
+    return false;
+}
+
+// gen dims for tensorlist input
+// DEPRECATED
+bool gen_dims_for_tensorlist(const torch::jit::Value* value, std::vector<int64_t>& dims) {
+    // if we want to treat the tensorlist as a single input to tensorrt,
+    // TODO: we should check the tensors in list are of the same size.
+    // std::vector<int64_t> pre_dims;
+    POROS_CHECK_TRUE(value->type()->isSubtypeOf(c10::ListType::ofTensors()),
+        "given value for gen_dims_for_tensorlist is not TensorList as expected");
+    std::vector<int64_t>().swap(dims);
+
+    auto producer = value->node();
+    if (producer->kind() == torch::jit::aten::meshgrid) {
+        LOG(INFO) << "to support: torch::jit::aten::meshgrid";
+
+    // prim::ListConstruct
+    } else if (producer->kind() == torch::jit::prim::ListConstruct) {
+        //situation one: some node like:
+        //  %out : Tensor[] = prim::ListConstruct(%input)
+        if (producer->inputs().size() > 0) {
+            auto op = producer->inputs()[0]->type()->cast<c10::TensorType>();
+            if (op->sizes().size().has_value() && op->scalarType().has_value()) {
+                dims = op->sizes().concrete_sizes().value();
+                return true;
+            }
+        //situation two: some node like:
+        //  %out : Tensor[] = prim::ListConstruct()
+        //  %new_out: Tensor[] = aten::append(%out, %item)
+        } else {
+            for (auto use: value->uses()) {
+                LOG(INFO) << "checking user: " << node_info_with_attr(use.user);
+                if (use.user->kind() == torch::jit::aten::append &&
+                    use.user->inputs().size() > 1) {
+                    auto op = use.user->inputs()[1]->type()->cast<c10::TensorType>();
+                    if (op->sizes().size().has_value() && op->scalarType().has_value()) {
+                        dims = op->sizes().concrete_sizes().value();
+                        return true;
+                    }
+                }
+            }
+        }
+        LOG(INFO) << "to support: torch::jit::prim::ListConstruct";
+    } else if (producer->kind() == torch::jit::prim::Constant) {
+        LOG(INFO) << "to support: torch::jit::prim::Constant";
+    } else {
+        // aten::unbind
+        // prim::Constant
+        LOG(INFO) << "to support: some kind of producer: " << producer->kind().toQualString();
+    }
+    return false;
+}
+
+//TODO: this method is relatively low-level, try to use SubgraphRewriter to handle this one
+//DEPRECATED
+bool is_linear_if_node(torch::jit::Node* node) {
+    /// Check if this Node hosts a pattern like so:
+    /// %ret = prim::If(%1)
+    ///   block0():
+    ///     %ret1 = aten::addmm(%bias, %input, %weight_t, %beta, %alpha)
+    ///     -> (%ret1)
+    ///   block1():
+    ///     %output = aten::matmul(%input, %weight_t)
+    ///     %ret2 = aten::add(%output, %bias, %alpha)
+    ///     -> (%ret2)
+    if (node->kind() != torch::jit::prim::If || node->blocks().size() != 2) {
+        return false;
+    }
+
+    auto block2vector = [](torch::jit::Block* block, std::vector<torch::jit::Node*>& nodes_vec) {
+        for (auto itr : block->nodes()) {
+            nodes_vec.emplace_back(itr);
+        }
+    };
+
+    std::vector<torch::jit::Node*> true_nodes;
+    std::vector<torch::jit::Node*> false_nodes;
+    block2vector(node->blocks()[0], true_nodes);
+    block2vector(node->blocks()[1], false_nodes);
+
+    if (node->blocks()[0]->outputs().size() != 1 ||
+        true_nodes.size() != 1 ||
+        true_nodes[0]->kind() != torch::jit::aten::addmm) {
+        return false;
+    }
+
+    if (node->blocks()[1]->outputs().size() != 1 ||
+        false_nodes.size() != 2 ||
+        false_nodes[0]->kind() != torch::jit::aten::matmul ||
+        false_nodes[1]->kind() != torch::jit::aten::add) {
+        return false;
+    }
+
+    auto is_input_const = [](torch::jit::Node* node, int index) {
+        return (node->inputs()[index])->node()->kind() == torch::jit::prim::Constant;
+    };
+
+    if (true_nodes[0]->inputs().size() != 5 ||
+        !is_input_const(true_nodes[0], 0) ||
+        !is_input_const(true_nodes[0], 2) ||
+        !is_input_const(true_nodes[0], 3) ||
+        !is_input_const(true_nodes[0], 4)) {
+        return false;
+    }
+
+    if (false_nodes[0]->inputs().size() != 2 ||
+        !is_input_const(false_nodes[0], 1) ||
+        false_nodes[0]->inputs()[0] != true_nodes[0]->inputs()[1] ||
+        false_nodes[0]->inputs()[1] != true_nodes[0]->inputs()[2] ||
+        false_nodes[1]->inputs()[0] != false_nodes[0]->outputs()[0] ||
+        false_nodes[1]->inputs()[1] != true_nodes[0]->inputs()[0] ||
+        false_nodes[1]->inputs()[2] != true_nodes[0]->inputs()[4]) {
+        return false;
+    }
+
+    return true;
+}
+
+//DEPRECATED
+std::vector<torch::jit::Value*> extract_linear_input(torch::jit::Node *node) {
+    std::vector<torch::jit::Value*> valid_inputs;
+    if (is_linear_if_node(node)) {
+        valid_inputs.emplace_back(node->inputs()[0]);
+        auto addmm_node = *((node->blocks()[0])->nodes().begin());
+        valid_inputs.emplace_back(addmm_node->inputs()[1]);
+    }
+    return valid_inputs;
+}
+
+//DEPRECATED
+bool is_dim_equal_if_node(torch::jit::Node* node) {
+    /// Check if this Node hosts a pattern like so:
+    /// %const_val : int = prim::Constant[value=2]()
+    /// %dim : int = aten::dim(%input_tensor)
+    /// %eq : bool = aten::eq(%dim, %const_val)
+    /// %ret = prim::If(%eq)
+    ///   block0():
+    ///     ...
+    ///   block1():
+    ///     ...
+    if (node->kind() != torch::jit::prim::If || node->blocks().size() != 2 ||
+        node->inputs().size() != 1) {
+        return false;
+    }
+
+    if (node->input(0)->node()->kind() == torch::jit::aten::eq) {
+        auto eq_node = node->input(0)->node();
+        if (eq_node->inputs().size() == 2 &&
+            eq_node->input(0)->node()->kind() == torch::jit::aten::dim &&
+            eq_node->input(1)->node()->kind() == torch::jit::prim::Constant) {
+            return true;
+        }
+    }
+    return false;
+}
+
+//DEPRECATED
+void inline_if_body(torch::jit::Block* body) {
+    torch::jit::Node* owning_node = body->owningNode();
+    for (auto itr = body->nodes().begin(); itr != body->nodes().end();) {
+        torch::jit::Node* body_node = *itr;
+        // advance the iterator first, because moving body_node invalidates its position
+        itr++;
+        body_node->moveBefore(owning_node);
+    }
+    for (size_t i = 0; i < owning_node->outputs().size(); ++i) {
+        owning_node->outputs().at(i)->replaceAllUsesWith(body->outputs().at(i));
+    }
+    owning_node->destroy();
+}
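+
+// Note: inline_if_body assumes the caller has already proven which branch is
+// always taken; it hoists that block's nodes in front of the prim::If and then
+// destroys the If node itself.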
+
+/*
+  bool shapeIsKnown(Value* v) {
+    if (v->type()->cast<c10::TensorType>()) {
+        if (!v->isCompleteTensor()) {
+            return false;
+        }
+        if (*v->type()->castRaw<c10::TensorType>()->dim() == 0) {
+            return false;
+        }
+    }
+    return true;
+  } */
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/util/poros_util.h b/poros/src/poros/util/poros_util.h
new file mode 100644
index 0000000000..a5ce88f1ee
--- /dev/null
+++ b/poros/src/poros/util/poros_util.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file poros_util.h
+* @author tianjinjin@baidu.com
+* @date Wed Apr 7 17:52:36 CST 2021
+* @brief
+**/
+
+#pragma once
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/script.h>
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+
+// the following two functions print the information of a single node
+inline std::string node_info_with_attr(const torch::jit::Node *n) {
+    std::stringstream ss;
+    n->print(ss, 0, {}, /*print_source_locations = */ false,
+        /* print_attributes = */ true,
+        /* print_scopes = */ false,
+        /* print_body = */ false);
+    return ss.str();
+}
+
+inline std::string node_info(const torch::jit::Node *n) {
+    std::stringstream ss;
+    n->print(ss, 0, {}, /*print_source_locations = */ false,
+        /* print_attributes = */ false,
+        /* print_scopes = */ false,
+        /* print_body = */ false);
+    return ss.str();
+}
+
+//when a node info looks like: %1126 : Float(*, 2, strides=[2, 1], requires_grad=0, device=cuda:0) = aten::squeeze(%output1.1, %self.new_length.1075)
+//we format the output like: [aten::squeeze][output:%1126][input:%output1.1, %self.new_length.1075]
+inline std::string layer_info(const torch::jit::Node *n) {
+    std::stringstream ss;
+    ss << "[" << n->kind().toQualString() << "]";
+    auto outs = n->outputs();
+    if (outs.size() > 0) {
+        ss << "[output:";
+        size_t i = 0;
+        for (auto out : outs) {
+            if (i++ > 0) {
+                ss << ",";
+            }
+            ss << "%" << out->debugName();
+        }
+        ss << "]";
+    }
+    auto ins = n->inputs();
+    if (outs.size() == 0 && ins.size() != 0) {
+        ss << "[][input:";
+        size_t i = 0;
+        for (auto in : ins) {
+            if (i++ > 0) {
+                ss << ",";
+            }
+            ss << "%" << in->debugName();
+        }
+        ss << "]";
+    }
+    return ss.str();
+}
+
+// inline std::string node_info(const torch::jit::Node* n) {
+//     std::stringstream ss;
+//     ss << *n;
+//     std::string node_info = ss.str();
+//     node_info.erase(std::remove(node_info.begin(), node_info.end(), '\n'), node_info.end());
+//     return node_info;
+// }
+
+int merge_graph_to_module(std::shared_ptr<torch::jit::Graph>& to_merge_graph,
+                          torch::jit::Module& module,
+                          bool init_module_ptr);
+
+torch::jit::Module build_tmp_module(std::shared_ptr<torch::jit::Graph>& sub_graph);
+
+bool gen_dims_for_tensor(const torch::jit::Value* value, std::vector<int64_t>& dims);
+
+/**
+ * @brief update global context when some Value copy happens in the Segment progress && engine transform progress.
+ *        Call this function whenever a Value is duplicated during subgraph segmentation or the
+ *        later engine transformation, so that the necessary global metadata of the value is
+ *        copied as well. As of 2022.01 this mainly copies the shape information stored in
+ *        value_dynamic_shape_map.
+ * @param [in] old_value : the original value
+ * @param [in] new_value : the new value, whose meta information is copied from the original one
+ * @return null
+ **/
+void update_global_context(torch::jit::Value* old_value, torch::jit::Value* new_value);
+
+/**
+ * @brief update global context when nodes are fused during subgraph segmentation:
+ *        the node-level keys of _list_size_map must be updated accordingly.
+ * @param [in] old_node : the original node
+ * @param [in] new_node : the new node, whose meta information is copied from the original one
+ * @return null
+ **/
+void update_global_list_size_map_node_key_context(torch::jit::Node* old_node, torch::jit::Node* new_node);
+
+/**
+ * @brief update global context when a Value that carries recorded int / int[] data is
+ *        duplicated during subgraph segmentation or engine transformation: the matching
+ *        entry of _int_intlist_values_map is copied over to the new value.
+ * @param [in] old_value : the original value
+ * @param [in] new_value : the new value, whose meta information is copied from the original one
+ * @return null
+ **/
+void update_global_int_intlist_map_context(torch::jit::Value* old_value, torch::jit::Value* new_value);
+
+/**
+ * @brief unmerge the subgraph to its parent graph (especially when engine transform failed):
+ *        merge the nodes of the subgraph back into the parent graph, which is mainly needed
+ *        for the fallback when transforming a subgraph into an engine fails.
+ *
+ * @param [in] subgraph_node : a special node of kind CudaFusionGroup.
+ * @return null
+ **/
+void unmerge_subgraph(torch::jit::Node* subgraph_node);
+
+/**
+ * @brief traverse the given block and its sub-blocks for CudaFusionGroup nodes and collect
+ *        them into to_optimized_nodes, ready to be optimized.
+ *
+ * @param [in] block : the block to traverse for CudaFusionGroup nodes.
+ * @param [out] to_optimized_nodes : the collected CudaFusionGroup nodes.
+ * @return null
+ **/
+void find_to_optimized_nodes(torch::jit::Block* block, std::vector<torch::jit::Node*>& to_optimized_nodes);
+
+/********************************************************************
+  SOME DEPRECATED FUNCTIONS BELOW
+*********************************************************************/
+//DEPRECATED
+bool gen_dims_for_scarlar(const torch::jit::Value* value, std::vector<int64_t>& dims);
+//DEPRECATED
+bool gen_dims_for_tensorlist(const torch::jit::Value* value, std::vector<int64_t>& dims);
+
+// Check whether a node is a linear-pattern prim::If that does not need to be expanded;
+// expanding it would create many branches and segment the whole graph into overly small pieces.
+//DEPRECATED
+bool is_linear_if_node(torch::jit::Node* node);
+
+//DEPRECATED
+std::vector<torch::jit::Value*> extract_linear_input(torch::jit::Node *node);
+
+// Check whether the condition of a prim::If node is produced by comparing aten::dim with a
+// constant; if so, the condition this If depends on is very likely a constant.
+//DEPRECATED
+bool is_dim_equal_if_node(torch::jit::Node* node);
+
+// When the condition of a prim::If always holds, remove the condition and hoist the
+// corresponding block.
+//DEPRECATED
+void inline_if_body(torch::jit::Block* body);
+
+} // namespace poros
+} // namespace mirana
+} // namespace baidu
diff --git a/poros/src/poros/util/test_util.cpp b/poros/src/poros/util/test_util.cpp
new file mode 100644
index 0000000000..83a000c5eb
--- /dev/null
+++ b/poros/src/poros/util/test_util.cpp
@@ -0,0 +1,367 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file test_util.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include "test_util.h"
+
+#include <ctime>
+#include <fstream>
+
+#include <cuda_runtime.h>
+#include <torch/csrc/jit/ir/irparser.h>
+#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
+
+#include "poros/compile/graph_prewarm.h"
+#include "poros/context/poros_global.h"
+#include "poros/engine/iengine.h"
+#include "poros/iplugin/plugin_create.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+namespace testutil {
+
+static inline void clone_tensor_vector(const std::vector<at::Tensor> &old_vec, std::vector<at::Tensor> &new_vec) {
+    for (auto i: old_vec) {
+        new_vec.push_back(i.clone());
+    }
+}
+
+static std::string get_engine_name(const baidu::mirana::poros::PorosOptions &poros_option) {
+    std::string engine_name("");
+    if (poros_option.device == Device::GPU) {
+        engine_name = "TensorrtEngine";
+    } else if (poros_option.device == Device::XPU) {
+        engine_name = "XtclEngine";
+    } else {
+        engine_name = "";
+    }
+    return engine_name;
+}
+
+static bool write_to_log(const std::string &log_path, const std::string &inform) {
+    if (log_path.empty()) {
+        return true;
+    } else {
+        std::ofstream log_file(log_path, std::ios::app);
+        if (log_file) {
+            log_file << inform;
+            log_file.close();
+            return true;
+        } else {
+            return false;
+        }
+    }
+}
+
+static baidu::mirana::poros::IEngine *
+select_engine(const torch::jit::Node *n, const baidu::mirana::poros::PorosOptions &poros_option) {
+    baidu::mirana::poros::IEngine *engine(nullptr);
+    if (n == nullptr || n->kind() != torch::jit::prim::CudaFusionGroup) {
+        return nullptr;
+    }
+    std::string engine_name = get_engine_name(poros_option);
+    if (engine_name.empty()) {
+        return nullptr;
+    }
+    engine = dynamic_cast<baidu::mirana::poros::IEngine*>(create_plugin(engine_name,
+        baidu::mirana::poros::PorosGlobalContext::instance()._engine_creator_map));
+    if (engine == nullptr || engine->init() < 0) {
+        return nullptr;
+    }
+    return engine;
+}
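+
+// Note: engines are obtained through the plugin registry (create_plugin above),
+// so an engine is only available when its creator has been registered into
+// PorosGlobalContext beforehand.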
failed"; + } + // 提取结果 + for (size_t i = 0; i < graph_input.size(); i++) { + auto tmp_ivalue = graph_input[i]; + graph_output.push_back(tmp_ivalue.toTensor()); + } + return graph_output; +}; + +static const std::vector has_constant_tensor_inputs_node() { + return {torch::jit::aten::batch_norm, + torch::jit::aten::_convolution, + torch::jit::aten::conv1d, + torch::jit::aten::conv2d, + torch::jit::aten::conv3d, + torch::jit::aten::layer_norm, + torch::jit::aten::lstm, + torch::jit::aten::group_norm, + torch::jit::aten::instance_norm}; +} + +static std::vector run_engine(std::shared_ptr &graph, + const baidu::mirana::poros::PorosOptions &poros_option, + baidu::mirana::poros::IConverter *converter, + const std::vector &input_data, + const std::string &log_path, + const std::vector> &prewarm_data_of_engine) { + std::vector engine_output; + // 避免inplace的op,prewarm后会更改原数据,例如:add_,所以先clone + PorosGlobalContext::instance()._value_dynamic_shape_map = {}; + + PorosGlobalContext::instance().set_poros_options(poros_option); + std::vector> prewarm_clone; + for (auto it = prewarm_data_of_engine.begin(); it != prewarm_data_of_engine.end(); ++it) { + std::vector input_clone; + clone_tensor_vector(*it, input_clone); + prewarm_clone.push_back(input_clone); + } + std::vector> prewarm_datas; + for (auto it = prewarm_clone.begin(); it != prewarm_clone.end(); ++it) { + std::vector prewarm_input_data; + for (size_t i = 0; i < (*it).size(); i++) { + prewarm_input_data.push_back((*it)[i]); + } + prewarm_datas.push_back(prewarm_input_data); + } + + // 检查graph中是否包含待converter的node + bool converter_node_exist = false; + torch::jit::Node *converter_node = nullptr;; + std::string converter_node_kind_name; + for (auto node_it: graph->nodes()) { + for (auto converter_node_kind: converter->node_kind()) { // compare node kind + if (node_it->kind().toQualString() == converter_node_kind.toQualString() + && is_node_supported(node_it, poros_option)) { + converter_node_exist = true; + converter_node_kind_name = node_it->kind().toQualString(); + converter_node = node_it; + break; + } + } + } + if (!converter_node_exist) { + LOG(WARNING) << "Can't find converter node."; + return engine_output; + } + + // 判断是否是batchnormal类型 + bool convter_has_constant_tensor_inputs = false; + for (auto node_kind_it: has_constant_tensor_inputs_node()) { + if (node_kind_it.toQualString() == converter_node->kind().toQualString()) { + convter_has_constant_tensor_inputs = true; + break; + } + } + + // 插入constant tensor inputs + if (convter_has_constant_tensor_inputs) { + torch::jit::WithInsertPoint guard(graph->block()->nodes().front()); + if (converter_node->kind().toQualString() == has_constant_tensor_inputs_node()[6].toQualString()) { + auto lt = c10::List({}); + for (size_t i = 3; i < prewarm_clone[0].size(); i++) { + lt.push_back(prewarm_clone[0][i]); + } + auto lt_ivalue = c10::IValue(lt); + auto len_const = graph->insertConstant(lt_ivalue); + converter_node->replaceInput(2, len_const); + } else { + for (size_t i = 1; i < prewarm_datas[0].size(); i++) { + auto len_const = graph->insertConstant(prewarm_datas[0][i]); + if (converter_node->kind().toQualString() == has_constant_tensor_inputs_node()[5].toQualString() + || converter_node->kind().toQualString() == has_constant_tensor_inputs_node()[7].toQualString()) { + converter_node->replaceInput(i + 1, len_const); + } else { + converter_node->replaceInput(i, len_const); + } + } + } + } + // 得到预热图 + auto prewarm_graph = baidu::mirana::poros::graph_prewarm(graph, prewarm_datas); + + // 将graph全部加入subgraph 
+    torch::jit::Node *subgraph_node = torch::jit::SubgraphUtils::createSingletonSubgraph(
+        *(prewarm_graph->nodes().begin()),
+        torch::jit::prim::CudaFusionGroup);
+    auto node_it = ++prewarm_graph->nodes().begin();
+    while (node_it != prewarm_graph->nodes().end()) {
+        torch::jit::SubgraphUtils::mergeNodeIntoSubgraph(*node_it, subgraph_node);
+        node_it = ++prewarm_graph->nodes().begin();
+    }
+
+    // select and initialize the engine
+    baidu::mirana::poros::IEngine *engine = select_engine(subgraph_node, poros_option);
+    if (engine == nullptr) {
+        LOG(WARNING) << "select engine failed";
+        return engine_output;
+    }
+
+    // transform the graph into the engine (including op replacement)
+    std::shared_ptr<torch::jit::Graph> sub_graph = subgraph_node->g(torch::jit::attr::Subgraph);
+    baidu::mirana::poros::PorosGraph poros_graph = {sub_graph.get(), subgraph_node};
+    if (engine->transform(poros_graph) < 0) {
+        LOG(WARNING) << "engine transform failed";
+        return engine_output;
+    }
+
+    // measure the engine outputs
+    std::clock_t start, end;
+    if (poros_option.device == Device::GPU) {
+        cudaDeviceSynchronize();
+    }
+    start = std::clock();
+    if (converter_has_constant_tensor_inputs) {
+        std::vector<at::Tensor> input_data_without_constant;
+        input_data_without_constant.push_back(input_data[0].clone());
+        if (converter_node->kind().toQualString() == has_constant_tensor_inputs_node()[6].toQualString()) {
+            input_data_without_constant.push_back(input_data[1].clone());
+            input_data_without_constant.push_back(input_data[2].clone());
+        }
+        engine_output = engine->excute_engine(input_data_without_constant);
+    } else {
+        engine_output = engine->excute_engine(input_data);
+    }
+    if (poros_option.device == Device::GPU) {
+        cudaDeviceSynchronize();
+    }
+    end = std::clock();
+    std::string log_inform = "engine time:" + std::to_string(double(end - start) / CLOCKS_PER_SEC * 1000.0) + "ms\t" +
+                             converter_node_kind_name + "\n";
+
+    std::cout << log_inform;
+
+    if (!write_to_log(log_path, log_inform)) {
+        LOG(WARNING) << "write to log failed";
+    }
+    return engine_output;
+}
+
+bool run_graph_and_poros(const std::string &graph_IR,
+                         const baidu::mirana::poros::PorosOptions &poros_option,
+                         baidu::mirana::poros::IConverter *converter,
+                         const std::vector<at::Tensor> &input_data,
+                         std::vector<at::Tensor> &graph_output,
+                         std::vector<at::Tensor> &poros_output,
+                         const std::vector<std::vector<at::Tensor>> *prewarm_data,
+                         std::string log_path,
+                         const std::vector<size_t> const_input_indices) {
+    try {
+        // parse the graph IR
+        auto graph = std::make_shared<torch::jit::Graph>();
+        torch::jit::parseIR(graph_IR, graph.get());
+        std::vector<at::Tensor> real_input;
+        clone_tensor_vector(input_data, real_input);
+
+        if (!const_input_indices.empty()) {
+            torch::jit::WithInsertPoint guard(graph->block()->nodes().front());
+            for (const size_t &index : const_input_indices) {
+                graph->inputs()[index]->replaceAllUsesWith(graph->insertConstant(input_data[index]));
+            }
+            for (auto it = const_input_indices.rbegin(); it != const_input_indices.rend(); it++) {
+                graph->eraseInput(*it);
+                real_input.erase(real_input.begin() + *it);
+            }
+        }
+        // run the original graph
+        std::vector<at::Tensor> input_of_graph;
+        clone_tensor_vector(real_input, input_of_graph);
+        graph_output = run_graph(graph, input_of_graph, poros_option, log_path);
+
+        // convert the op and run the engine
+        std::vector<at::Tensor> input_of_engine;
+        clone_tensor_vector(real_input, input_of_engine);
+
+        // prepare the prewarm data
+        std::vector<std::vector<at::Tensor>> prewarm_data_of_engine;
+        if (prewarm_data == nullptr) {
+            prewarm_data_of_engine.push_back(input_of_engine);
+        } else {
+            for (size_t i = 0; i < (*prewarm_data).size(); ++i) {
+                std::vector<at::Tensor> tmp_clone_data;
+                clone_tensor_vector((*prewarm_data)[i], tmp_clone_data);
+                // this should not happen in principle, but guard against it just in case
+                if (!const_input_indices.empty() && tmp_clone_data.size() == input_data.size()) {
+                    for (auto it = const_input_indices.rbegin(); it != const_input_indices.rend(); it++) {
+                        tmp_clone_data.erase(tmp_clone_data.begin() + *it);
+                    }
+                }
+                prewarm_data_of_engine.push_back(tmp_clone_data);
+            }
+        }
+        poros_output = run_engine(graph, poros_option, converter, input_of_engine, log_path, prewarm_data_of_engine);
+    } catch (const char* err) {
+        LOG(ERROR) << " Exception: " << err;
+        return false;
+    }
+    return true;
+}
+
+bool almost_equal(const at::Tensor &a, const at::Tensor &b, const float &threshold) {
+    auto a_float = a.toType(at::kFloat);
+    auto b_float = b.toType(at::kFloat);
+    double maxValue = 0.0;
+    maxValue = fmax(a.abs().max().item<float>(), maxValue);
+    maxValue = fmax(b.abs().max().item<float>(), maxValue);
+    at::Tensor diff = a_float - b_float;
+    return diff.abs().max().item<float>() <= threshold * maxValue;
+}
+
+}// namespace testutil
+}// namespace poros
+}// namespace mirana
+}// namespace baidu
diff --git a/poros/src/poros/util/test_util.h b/poros/src/poros/util/test_util.h
new file mode 100644
index 0000000000..97ad20c9e9
--- /dev/null
+++ b/poros/src/poros/util/test_util.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file test_util.h
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#pragma once
+
+#include "poros/compile/poros_module.h"
+#include "poros/converter/iconverter.h"
+
+namespace baidu {
+namespace mirana {
+namespace poros {
+namespace testutil {
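+
+// Usage sketch (illustrative; the IR string `graph_IR` and the converter
+// choice are placeholders, not part of this header):
+//
+//   baidu::mirana::poros::PorosOptions option;          // defaults to GPU
+//   baidu::mirana::poros::ActivationConverter conv;     // converter under test
+//   std::vector<at::Tensor> graph_out, poros_out;
+//   std::vector<at::Tensor> input = {at::randn({5}, {at::kCUDA})};
+//   run_graph_and_poros(graph_IR, option, &conv, input, graph_out, poros_out);
+//   // then compare graph_out[i] against poros_out[i] with almost_equal(...)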
+
+/**
+ * @brief run graph and poros, then compare their outputs
+ *
+ * @param [in] graph_IR : string of IR
+ * @param [in] poros_option : default device is GPU
+ * @param [in] converter : converter under test
+ * @param [in] input_data : vector of at::Tensor, one set of input data for the graph
+ * @param [in] log_path : records the running time of the graph and the engine; default is empty and nothing is recorded.
+ * @param [in] prewarm_data : preheating data; default is null, in which case input_data is used for preheating
+ * @param [in] const_input_indices : indices into input_data whose entries are turned into constant
+ *             tensors before the graph runs (i.e. constant weight parameters); this changes the
+ *             graph and the real input data implicitly.
+ * @param [out] graph_output : vector of at::Tensor, graph outputs
+ * @param [out] poros_output : vector of at::Tensor, poros outputs
+ * @return bool
+ * @retval true => succeed  false => failed
+**/
+bool run_graph_and_poros(const std::string &graph_IR,
+                         const baidu::mirana::poros::PorosOptions &poros_option,
+                         baidu::mirana::poros::IConverter *converter,
+                         const std::vector<at::Tensor> &input_data,
+                         std::vector<at::Tensor> &graph_output,
+                         std::vector<at::Tensor> &poros_output,
+                         const std::vector<std::vector<at::Tensor>> *prewarm_data = nullptr,
+                         std::string log_path = "",
+                         const std::vector<size_t> const_input_indices = {});
+
+/**
+ * @brief compare the similarity of two Tensors containing Float
+ *
+ * @param [in] a : first Tensor
+ * @param [in] b : second Tensor
+ * @param [in] threshold : acceptable relative threshold
+ * @return bool
+ * @retval true => succeed  false => failed
+**/
+bool almost_equal(const at::Tensor &a, const at::Tensor &b, const float &threshold);
+
+}// namespace testutil
+}// namespace poros
+}// namespace mirana
+}// namespace baidu
diff --git a/poros/tools/main.cpp b/poros/tools/main.cpp
new file mode 100644
index 0000000000..31ea760885
--- /dev/null
+++ b/poros/tools/main.cpp
@@ -0,0 +1,158 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file main.cpp
+* @author tianjinjin@baidu.com
+* @date Tue Mar 9 14:43:42 CST 2021
+* @brief a tool to convert an original serialized script module into an optimized one
+**/
+
+#include <iostream>
+#include <sys/time.h>
+
+#include <gflags/gflags.h>
+#include <torch/script.h>
+#include <torch/csrc/jit/jit_log.h>
+
+#include "poros/compile/compile.h"
+//#include "poros/compile/poros_module.h"
+
+DEFINE_int32(batch_size, 1, "the batch size for model inference");
+DEFINE_int32(repeated_num, 1000, "how many repeated test times for a single input data");
+DEFINE_string(test_mode, "poros", "which mode we test this time; there are only two options: poros/original");
+DEFINE_string(module_file_path, "../model/std_pretrained_resnet50_gpu.pt",
+              "the model file path, replace this with a real one");
+DEFINE_bool(is_dynamic, false, "the model type, used to choose input data");
+
+void build_test_data(int batch_size,
+                     std::vector<std::vector<torch::jit::IValue>> &prewarm_datas, bool is_dynamic) {
+    std::vector<torch::jit::IValue> inputs;
+
+    if (is_dynamic == false) {
+        inputs.push_back(at::randn({batch_size, 3, 224, 224}, {at::kCUDA}));
+        prewarm_datas.push_back(inputs);
+        return;
+    }
+    //max
+    inputs.push_back(at::randn({16, 3, 224, 224}, {at::kCUDA}));
+    prewarm_datas.push_back(inputs);
+    //min
+    std::vector<torch::jit::IValue> inputs2;
+    inputs2.push_back(at::randn({1, 3, 224, 224}, {at::kCUDA}));
+    prewarm_datas.push_back(inputs2);
+
+    //opt
+    std::vector<torch::jit::IValue> inputs3;
+    inputs3.push_back(at::randn({6, 3, 224, 224}, {at::kCUDA}));
+    prewarm_datas.push_back(inputs3);
+}
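+
+// Note: for the dynamic-shape case the three prewarm batches built above
+// (max/min/opt) describe the largest, smallest and preferred input sizes the
+// engine is warmed with; this is the usual way TensorRT-style optimization
+// profiles are fed.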
+
+/* load a serialized script module and optimize it;
+   this runs as a conversion tool */
+int main(int argc, char *argv[]) {
+    google::ParseCommandLineFlags(&argc, &argv, true);
+//    gflags::SetCommandLineOption("flagfile", "./conf/gflags.conf");
+
+    torch::jit::Module mod;
+    struct timeval start, end;
+    float time_use;
+    //////////////////////////////////////////////////////////////////
+    //step1: load the origin model file
+    //////////////////////////////////////////////////////////////////
+    try {
+        // Deserialize the ScriptModule from a file using torch::jit::load().
+        mod = torch::jit::load(FLAGS_module_file_path);
+    } catch (const c10::Error &e) {
+        std::cerr << "error loading the model\n" << e.msg();
+        return -1;
+    }
+    mod.eval();
+    //mod.to(at::kCPU);
+    mod.to(at::kCUDA);
+
+    //////////////////////////////////////////////////////////////////
+    //step2: prepare input data
+    //////////////////////////////////////////////////////////////////
+    // Create a vector of inputs for std-resnet50.
+    std::vector<std::vector<torch::jit::IValue>> prewarm_datas;
+    build_test_data(FLAGS_batch_size, prewarm_datas, FLAGS_is_dynamic);
+    //mod.forward(prewarm_datas[0]);
+
+    std::cout << "input data is ok: prewarm_datas size: " << prewarm_datas.size() << std::endl;
+
+    //////////////////////////////////////////////////////////////////
+    //step3: run the selected test mode and benchmark it
+    //////////////////////////////////////////////////////////////////
+    int warm_up_cycle = 50;
+    if (FLAGS_test_mode == "poros") {
+        baidu::mirana::poros::PorosOptions option;
+        option.device = baidu::mirana::poros::Device::GPU;
+        option.is_dynamic = FLAGS_is_dynamic;
+        option.debug = true;
+
+        auto poros_mod = baidu::mirana::poros::Compile(mod, prewarm_datas, option);
+        //poros_mod->to(at::kCUDA);
+        torch::jit::getProfilingMode() = true;
+        torch::jit::getExecutorMode() = true;
+        torch::jit::setGraphExecutorOptimize(false);
+
+        //warm up
+        for (int i = 0; i < warm_up_cycle; i++) {
+            poros_mod->forward(prewarm_datas[0]);
+        }
+        //the actual benchmark loop
+        gettimeofday(&start, NULL);
+        for (int i = 0; i < FLAGS_repeated_num; i++) {
+            auto output = poros_mod->forward(prewarm_datas[0]);
+        }
+        gettimeofday(&end, NULL);
+
+    } else if (FLAGS_test_mode == "original") {
+        GRAPH_DUMP("graph info:", mod.get_method("forward").graph());
+        //warm up
+        for (int i = 0; i < warm_up_cycle; i++) {
+            mod.forward(prewarm_datas[0]);
+        }
+        //the actual benchmark loop
+        gettimeofday(&start, NULL);
+        for (int i = 0; i < FLAGS_repeated_num; i++) {
+            auto output = mod.forward(prewarm_datas[0]);
+        }
+        gettimeofday(&end, NULL);
+        GRAPH_DUMP("torch.jit.last_executed_optimized_graph", torch::jit::lastExecutedOptimizedGraph());
+    } else {
+        std::cerr << "given test mode: " << FLAGS_test_mode.c_str() << " not supported"
+                  << ", only poros/original supported";
+        return -1;
+    }
+
+    //////////////////////////////////////////////////////////////////
+    //step4: print the benchmark result
+    //////////////////////////////////////////////////////////////////
+    time_use = (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / (double) 1000000;
+    std::cout << "test mode: " << FLAGS_test_mode.c_str()
+              << ", repeated times: " << FLAGS_repeated_num
+              << ", spend time: " << time_use / FLAGS_repeated_num * 1000
+              << " ms/infer" << std::endl;
+
+    std::cout << "test done QAQ\n";
+}
diff --git a/poros/unittest/CMakeLists.txt b/poros/unittest/CMakeLists.txt
new file mode 100644
index 0000000000..52f44495fa
--- /dev/null
+++ b/poros/unittest/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.21)
+project(unittest)
+set(CMAKE_CXX_STANDARD 14)
+
+enable_testing()
+include(GoogleTest)
+
+set(GRAPHTEST "graph_test")
+
+file(
+    GLOB UT_FILES
+    "./op_fuser/*.cpp"
+    "../src/poros/lowering/fuse_*.cpp"
+)
+list(APPEND UT_FILES
+    "../src/poros/lowering/op_fuse_pass.cpp"
+    "../src/poros/util/graph_test_helper.cpp")
+
+add_executable(${GRAPHTEST} ${UT_FILES})
+target_link_libraries(${GRAPHTEST} gtest_main)
+target_link_libraries(${GRAPHTEST} gflags::gflags)
+#target_link_libraries(${GRAPHTEST} TensorRT::TensorRT)
+target_link_libraries(${GRAPHTEST} torch)
+#target_link_libraries(${GRAPHTEST} CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse)
+
+# unit test
+set(UNITTEST "unit_test")
+
+file(
+    GLOB UT_FILES
+    "../src/poros/*/*.cpp"
+    "../src/poros/converter/*/*.cpp"
+    "./converter/*.cpp"
+)
+
+add_executable(${UNITTEST} ${UT_FILES})
+target_link_libraries(${UNITTEST} gtest_main)
+target_link_libraries(${UNITTEST} gflags::gflags)
+target_link_libraries(${UNITTEST} TensorRT::TensorRT TensorRT::Plugin)
+target_link_libraries(${UNITTEST} torch)
+target_link_libraries(${UNITTEST} CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse)
diff --git a/poros/unittest/converter/activation_test.cpp b/poros/unittest/converter/activation_test.cpp
new file mode 100644
index 0000000000..ab6a16bfbb
--- /dev/null
+++ b/poros/unittest/converter/activation_test.cpp
@@ -0,0 +1,228 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file activation_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/activation.h"
+#include "poros/util/test_util.h"
+
+static void activation_test_helper(const std::string& graph_IR,
+                                   baidu::mirana::poros::IConverter* converter,
+                                   std::vector<int64_t> shape1 = {5},
+                                   bool single_input = true,
+                                   std::vector<int64_t> shape2 = {5}) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    if (!single_input) {
+        input_data.push_back(at::randn(shape2, {at::kCUDA}));
+    }
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+        input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    std::string gelu_node_str("aten::gelu");
+    if (converter->node_kind()[0].toQualString() == gelu_node_str) {
+        // NOTE: The official tensorrt plugin applies the Gelu activation x * Phi(x), where Phi is the Gaussian cdf,
+        // approximated by: 0.5 * (1 + tanh(sqrt(2 / M_PI) * (x + 0.044715 * x^3))), while pytorch uses
+        // c10::cuda::compat::normcdf to compute Phi(x). There is therefore a small difference, and the threshold
+        // here is slightly higher than for other ops; roughly one in ten runs would otherwise fall outside it.
+        ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 5e-2));
+    } else {
+        ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+    }
+}
+
+static std::string gen_single_node_graph(const std::string& op) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::)IR" + op + R"IR((%0)
+        return (%1))IR";
+}
+
+static std::string gen_hardtanh_graph(const std::string& op,
+                                      const std::string& min_val,
+                                      const std::string& max_val) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : float = prim::Constant[value=)IR" + min_val + R"IR(]()
+        %2 : float = prim::Constant[value=)IR" + max_val + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
+
+static std::string gen_leakyrelu_graph(const std::string& op, const std::string& negative_slope) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : float = prim::Constant[value=)IR" + negative_slope + R"IR(]()
+        %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+        return (%2))IR";
+}
+
+static std::string gen_elu_graph(const std::string& alpha) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : float = prim::Constant[value=)IR" + alpha + R"IR(]()
+        %2 : int = prim::Constant[value=1]()
+        %3 : Tensor = aten::elu(%0, %1, %2, %2)
+        return (%3))IR";
+}
+
+TEST(Converters, ATenReluConvertsCorrectly) {
+    // aten::relu(Tensor self) -> Tensor
+    const auto graph_IR = gen_single_node_graph("relu");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenRelu_ConvertsCorrectly) {
+    // aten::relu_(Tensor(a!) self) -> Tensor(a!)
+    const auto graph_IR = gen_single_node_graph("relu_");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenRelu6ConvertsCorrectly) {
+    // aten::relu6(Tensor self) -> Tensor
+    const auto graph_IR = gen_single_node_graph("relu6");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenRelu6_ConvertsCorrectly) {
+    // aten::relu6_(Tensor(a!) self) -> Tensor(a!)
+    const auto graph_IR = gen_single_node_graph("relu6_");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenSigmoidConvertsCorrectly) {
+    // aten::sigmoid(Tensor self) -> Tensor
+    const auto graph_IR = gen_single_node_graph("sigmoid");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenSigmoid_ConvertsCorrectly) {
+    // aten::sigmoid_(Tensor(a!) self) -> Tensor(a!)
+    const auto graph_IR = gen_single_node_graph("sigmoid_");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenTanhConvertsCorrectly) {
+    // aten::tanh(Tensor self) -> Tensor
+    const auto graph_IR = gen_single_node_graph("tanh");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenTanh_ConvertsCorrectly) {
+    // aten::tanh_(Tensor(a!) self) -> Tensor(a!)
+    const auto graph_IR = gen_single_node_graph("tanh_");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenGeluConvertsCorrectly) {
+    std::string graph_IR_str;
+    // aten::gelu schema changed in torch-1.12
+    if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 12) {
+        // aten::gelu(Tensor self) -> Tensor
+        graph_IR_str = gen_single_node_graph("gelu");
+    } else {
+        // aten::gelu(Tensor self, *, str approximate='none') -> Tensor
+        graph_IR_str = R"IR(
+          graph(%0 : Tensor):
+            %approximate : str = prim::Constant[value="tanh"]()
+            %1 : Tensor = aten::gelu(%0, %approximate)
+            return (%1))IR";
+    }
+    const auto graph_IR = graph_IR_str;
+    baidu::mirana::poros::GeluActivationConverter geluactivationconverter;
+    activation_test_helper(graph_IR, &geluactivationconverter, {10});
+}
+
+TEST(Converters, ATenLeakyreluConvertsCorrectly) {
+    // aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
+    const auto graph_IR = gen_leakyrelu_graph("leaky_relu", "0.01");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenLeakyreluNegSlopeConvertsCorrectly) {
+    // aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
+    const auto graph_IR = gen_leakyrelu_graph("leaky_relu", "0.05");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenHardtanhConvertsCorrectly) {
+    // aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
+    const auto graph_IR = gen_hardtanh_graph("hardtanh", "-1.0", "1.0");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenHardtanhMinvalMaxvalConvertsCorrectly) {
+    // aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
+    const auto graph_IR = gen_hardtanh_graph("hardtanh", "-3.5", "2.5");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenHardtanh_ConvertsCorrectly) {
+    // aten::hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
+    const auto graph_IR = gen_hardtanh_graph("hardtanh_", "-1.0", "1.0");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenHardtanh_MinvalMaxvalConvertsCorrectly) {
+    // aten::hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
+    const auto graph_IR = gen_hardtanh_graph("hardtanh_", "-2.1", "3.8");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenEluConvertsCorrectly) {
+    // aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+    const auto graph_IR = gen_elu_graph("1.0");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenEluAlphaConvertsCorrectly) {
+    // aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+    const auto graph_IR = gen_elu_graph("3.4");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
+
+TEST(Converters, ATenEluNegAlphaConvertsCorrectly) {
+    // aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+    const auto graph_IR = gen_elu_graph("-2.1");
+    baidu::mirana::poros::ActivationConverter activationconverter;
+    activation_test_helper(graph_IR, &activationconverter);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/add_test.cpp b/poros/unittest/converter/add_test.cpp
new file mode 100644
index 0000000000..093be5da7e
--- /dev/null
+++ b/poros/unittest/converter/add_test.cpp
@@ -0,0 +1,441 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file add_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/add.h"
+#include "poros/util/test_util.h"
+
+static void add_test_helper(const std::string& graph_IR,
+                            baidu::mirana::poros::IConverter* converter,
+                            bool singleInput,
+                            std::vector<int64_t> shape1 = {5},
+                            std::vector<int64_t> shape2 = {5}) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    if (!singleInput) {
+        input_data.push_back(at::randn(shape2, {at::kCUDA}));
+    }
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+        input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+static std::string gen_add_sub_tensor_graph(const std::string& op,
+                                            const std::string& alpha) {
+    return R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : float = prim::Constant[value=)IR" + alpha + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
+
+static std::string gen_add_sub_scalar_graph(const std::string& op,
+                                            const std::string& scalar,
+                                            const std::string& alpha) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : float = prim::Constant[value=)IR" + scalar + R"IR(]()
+        %2 : float = prim::Constant[value=)IR" + alpha + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
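+
+// For reference, gen_add_sub_tensor_graph("add", "1.0") expands to roughly:
+//   graph(%0 : Tensor, %1 : Tensor):
+//     %2 : float = prim::Constant[value=1.0]()
+//     %3 : Tensor = aten::add(%0, %1, %2)
+//     return (%3)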
+TEST(Converters, ATenAdd_ScalarConvertsCorrectly) {
+    // aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_scalar_graph("add_", "2.2", "1.0");
+    baidu::mirana::poros::AddConverter addconverter;
+    add_test_helper(graph_IR, &addconverter, true);
+    add_test_helper(graph_IR, &addconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenAddTensorAlphaConvertsCorrectly) {
+    // aten::add.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("add", "2.5");
+    baidu::mirana::poros::AddConverter addconverter;
+    add_test_helper(graph_IR, &addconverter, false);
+    add_test_helper(graph_IR, &addconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &addconverter, false, {4}, {3, 4});
+    add_test_helper(graph_IR, &addconverter, false, {4, 1}, {1, 4});
+    add_test_helper(graph_IR, &addconverter, false, {3, 4, 3}, {4, 3});
+    add_test_helper(graph_IR, &addconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenAddScalarAlphaConvertsCorrectly) {
+    // aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_scalar_graph("add", "2.2", "2.5");
+    baidu::mirana::poros::AddConverter addconverter;
+    add_test_helper(graph_IR, &addconverter, true);
+    add_test_helper(graph_IR, &addconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenAdd_TensorAlphaConvertsCorrectly) {
+    // aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_tensor_graph("add_", "2.5");
+    baidu::mirana::poros::AddConverter addconverter;
+    add_test_helper(graph_IR, &addconverter, false);
+    add_test_helper(graph_IR, &addconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &addconverter, false, {3, 4, 3}, {4, 3});
+}
+
+TEST(Converters, ATenAdd_ScalarAlphaConvertsCorrectly) {
+    // aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_scalar_graph("add_", "2.2", "2.5");
+    baidu::mirana::poros::AddConverter addconverter;
+    add_test_helper(graph_IR, &addconverter, true);
+    add_test_helper(graph_IR, &addconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenSubTensorConvertsCorrectly) {
+    // aten::sub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("sub", "1.0");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, false);
+    add_test_helper(graph_IR, &subconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &subconverter, false, {4}, {3, 4});
+    add_test_helper(graph_IR, &subconverter, false, {4, 1}, {1, 4});
+    add_test_helper(graph_IR, &subconverter, false, {3, 4, 3}, {4, 3});
+    add_test_helper(graph_IR, &subconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenSubScalarConvertsCorrectly) {
+    // aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_scalar_graph("sub", "2.2", "1.0");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, true);
+    add_test_helper(graph_IR, &subconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenSub_TensorConvertsCorrectly) {
+    // aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_tensor_graph("sub_", "1.0");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, false);
+    add_test_helper(graph_IR, &subconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &subconverter, false, {3, 4, 3}, {4, 3});
+}
+
+TEST(Converters, ATenSub_ScalarConvertsCorrectly) {
+    // aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_scalar_graph("sub_", "2.2", "1.0");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, true);
+    add_test_helper(graph_IR, &subconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenSubTensorAlphaConvertsCorrectly) {
+    // aten::sub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("sub", "2.5");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, false);
+    add_test_helper(graph_IR, &subconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &subconverter, false, {4}, {3, 4});
+    add_test_helper(graph_IR, &subconverter, false, {4, 1}, {1, 4});
+    add_test_helper(graph_IR, &subconverter, false, {3, 4, 3}, {4, 3});
+    add_test_helper(graph_IR, &subconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenSubScalarAlphaConvertsCorrectly) {
+    // aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    const auto graph_IR = gen_add_sub_scalar_graph("sub", "2.2", "2.5");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, true);
+    add_test_helper(graph_IR, &subconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenSub_TensorAlphaConvertsCorrectly) {
+    // aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_tensor_graph("sub_", "2.5");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, false);
+    add_test_helper(graph_IR, &subconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &subconverter, false, {3, 4, 3}, {4, 3});
+}
+
+TEST(Converters, ATenSub_ScalarAlphaConvertsCorrectly) {
+    // aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    const auto graph_IR = gen_add_sub_scalar_graph("sub_", "2.2", "2.5");
+    baidu::mirana::poros::SubConverter subconverter;
+    add_test_helper(graph_IR, &subconverter, true);
+    add_test_helper(graph_IR, &subconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenRsubTensorConvertsCorrectly) {
+    // aten::rsub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = gen_add_sub_tensor_graph("rsub", "1.0");
+    baidu::mirana::poros::RsubConverter rsubconverter;
+    add_test_helper(graph_IR, &rsubconverter, false);
+    add_test_helper(graph_IR, &rsubconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &rsubconverter, false, {4}, {3, 4});
+    add_test_helper(graph_IR, &rsubconverter, false, {4, 1}, {1, 4});
+    add_test_helper(graph_IR, &rsubconverter, false, {3, 4, 3}, {4, 3});
+    add_test_helper(graph_IR, &rsubconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenRsubScalarConvertsCorrectly) {
+    // aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = gen_add_sub_scalar_graph("rsub", "2.2", "1.0");
+    baidu::mirana::poros::RsubConverter rsubconverter;
+    add_test_helper(graph_IR, &rsubconverter, true);
+    add_test_helper(graph_IR, &rsubconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenRsubTensorAlphaConvertsCorrectly) {
+    // aten::rsub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = gen_add_sub_tensor_graph("rsub", "3.33");
+    baidu::mirana::poros::RsubConverter rsubconverter;
+    add_test_helper(graph_IR, &rsubconverter, false);
+    add_test_helper(graph_IR, &rsubconverter, false, {3, 4}, {4});
+    add_test_helper(graph_IR, &rsubconverter, false, {4}, {3, 4});
+    add_test_helper(graph_IR, &rsubconverter, false, {4, 1}, {1, 4});
+    add_test_helper(graph_IR, &rsubconverter, false, {3, 4, 3}, {4, 3});
+    add_test_helper(graph_IR, &rsubconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenRsubScalarAlphaConvertsCorrectly) {
+    // aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = gen_add_sub_scalar_graph("rsub", "2.2", "4.44");
+    baidu::mirana::poros::RsubConverter rsubconverter;
+    add_test_helper(graph_IR, &rsubconverter, true);
+    add_test_helper(graph_IR, &rsubconverter, true, {3, 4, 3});
+}
+
+TEST(Converters, ATenRsubTensorTypePromotionConvertsCorrectly) {
+    // aten::rsub.Tensor(Tensor self, Tensor other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : float = prim::Constant[value=3.33]()
+        %3 : Tensor = aten::rsub(%0, %1, %2)
+        return (%3))IR";
+    baidu::mirana::poros::RsubConverter rsubconverter;
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({3,4,3}, {at::kCUDA}));
+    input_data.push_back(at::ones({3,4,3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &rsubconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenRsubScalarTypePromotionConvertsCorrectly) {
+    // aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=5]()
+        %2 : float = prim::Constant[value=3.33]()
+        %3 : Tensor = aten::rsub(%0, %1, %2)
+        return (%3))IR";
+    baidu::mirana::poros::RsubConverter rsubconverter;
+    add_test_helper(graph_IR, &rsubconverter, true);
+}
+
+static void add_sub_dynamic_test_helper(const std::string& graph_IR,
+                                        baidu::mirana::poros::IConverter* converter,
+                                        const std::vector<at::Tensor>& input_data,
+                                        bool is_dynamic = false,
+                                        std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
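+// Note on prewarm_data: the dynamic-shape tests pass three groups of prewarm
+// inputs. Judging from the shapes used below, group [0] holds the largest
+// shapes and groups [1]/[2] the smaller ones, which presumably map to the
+// max/min/opt shape profiles used to prewarm the dynamic engine; that mapping
+// is inferred from the test data here, not from a documented contract.
+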
+TEST(Converters, ATenAddIntdynamicConvertsCorrectly) {
+    // aten::add.int(int a, int b) -> (int)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=0]()
+        %2 : int = prim::Constant[value=1]()
+        %3 : int = aten::size(%0, %1)
+        %4 : int = aten::size(%0, %2)
+        %5 : int = aten::add(%3, %4)
+        %6 : Tensor = aten::add(%0, %5, %2)
+        return (%6))IR";
+    baidu::mirana::poros::AddConverter addconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[1].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[2].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    add_sub_dynamic_test_helper(graph_IR, &addconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSubIntdynamicConvertsCorrectly) {
+    // aten::sub.int(int a, int b) -> (int)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=0]()
+        %2 : int = prim::Constant[value=1]()
+        %3 : int = aten::size(%0, %1)
+        %4 : int = aten::size(%0, %2)
+        %5 : int = aten::sub(%3, %4)
+        %6 : Tensor = aten::add(%0, %5, %2)
+        return (%6))IR";
+    baidu::mirana::poros::SubConverter subconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[1].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[2].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    add_sub_dynamic_test_helper(graph_IR, &subconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenAddTdynamicConvertsCorrectly) {
+    // aten::add.t(t[] a, t[] b) -> (t[])
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int[] = aten::size(%0)
+        %3 : int[] = aten::size(%1)
+        %4 : int[] = aten::add(%2, %3)
+        %5 : int = prim::Constant[value=2]()
+        %6 : int = aten::__getitem__(%4, %5)
+        %7 : int = prim::Constant[value=1]()
+        %8 : Tensor = aten::add(%0, %6, %7)
+        return (%8))IR";
+    baidu::mirana::poros::AddConverter addconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    input_data.push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[0].push_back(at::zeros({6, 7}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[1].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[1].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[2].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[2].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    add_sub_dynamic_test_helper(graph_IR, &addconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenAddTensordynamicConvertsCorrectly) {
+    // dynamic tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("add", "1.0");
+    baidu::mirana::poros::AddConverter addconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({15, 1}, {at::kCUDA}));
+    input_data.push_back(at::randn({300}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({40, 1}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({300}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({8, 1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({300}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20, 1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({300}, {at::kCUDA}));
+
+    add_sub_dynamic_test_helper(graph_IR, &addconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenAddTensordynamicMoreConvertsCorrectly) {
+    // dynamic tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("add", "1.0");
+    baidu::mirana::poros::AddConverter addconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 1}, {at::kCUDA}));
+    input_data.push_back(at::randn({300}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({4, 1}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({400}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({100}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({200}, {at::kCUDA}));
+
+    add_sub_dynamic_test_helper(graph_IR, &addconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenAddTensordynamicMore2ConvertsCorrectly) {
+    // dynamic tensor
+    const auto graph_IR = gen_add_sub_tensor_graph("add", "1.0");
+    baidu::mirana::poros::AddConverter addconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 1, 45}, {at::kCUDA}));
+    input_data.push_back(at::randn({300, 1}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({400, 1, 45}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({400, 1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 1, 45}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({100, 1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({100, 1, 45}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({200, 1}, {at::kCUDA}));
+
+    add_sub_dynamic_test_helper(graph_IR, &addconverter, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/aten_eval_test.cpp b/poros/unittest/converter/aten_eval_test.cpp
new file mode 100644
index 0000000000..762906ee1f
--- /dev/null
+++ b/poros/unittest/converter/aten_eval_test.cpp
@@ -0,0 +1,158 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file aten_eval_test.cpp
+* @author wangrui39@baidu.com
+* @date Mon December 13 11:36:11 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/aten_eval.h"
+#include "poros/util/test_util.h"
+
+static void aten_eval_test_helper(const std::string& graph_IR,
+                                  const std::vector<at::Tensor>& input_data,
+                                  baidu::mirana::poros::IConverter* converter) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+
+/*TEST(Converters, AppendConverterCorrectly) {
+    // "aten::append.t(t[](a!) self, t(c -> *) el) -> (t[](a!))"
+
+    const auto graph = R"IR(
+      graph(%a.1 : Tensor,
+            %b.1 : Tensor,
+            %c.1 : Tensor):
+        %12 : int = prim::Constant[value=0]()
+        %x.1 : Tensor[] = prim::ListConstruct(%a.1)
+        %7 : Tensor[] = aten::append(%x.1, %b.1) # test.py:26:4
+        %10 : Tensor[] = aten::append(%x.1, %c.1) # test.py:27:4
+        %13 : Tensor = aten::cat(%x.1, %12) # test.py:28:11
+        return (%13))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto input1 = at::randn({3, 4}, {at::kCUDA});
+    auto input2 = at::randn({3, 4}, {at::kCUDA});
+    auto input3 = at::randn({3, 4}, {at::kCUDA});
+
+    input_data.push_back(input1);
+    input_data.push_back(input2);
+    input_data.push_back(input3);
+
+    baidu::mirana::poros::AppendConverter appendConverter;
+    aten_eval_test_helper(graph, input_data, &appendConverter);
+}*/
+
+TEST(Converters, GetitemConverterCorrectly) {
+    // "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))"
+
+    const auto graph = R"IR(
+      graph(%a.1 : Tensor,
+            %b.1 : Tensor):
+        %12 : int = prim::Constant[value=0]()
+        %16 : int = prim::Constant[value=1]()
+        %x.1 : Tensor[] = prim::ListConstruct(%a.1)
+        %7 : Tensor[] = aten::append(%x.1, %b.1)
+        %ret.1 : Tensor = aten::__getitem__(%x.1, %12)
+        %17 : Tensor = aten::__getitem__(%x.1, %16)
+        %19 : Tensor = aten::add(%ret.1, %17, %16)
+        return (%19))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto input1 = at::randn({3, 4}, {at::kCUDA});
+    auto input2 = at::randn({3, 4}, {at::kCUDA});
+
+    input_data.push_back(input1);
+    input_data.push_back(input2);
+
+    baidu::mirana::poros::GetitemConverter getitemconverter;
+    aten_eval_test_helper(graph, input_data, &getitemconverter);
+}
+
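+// Although the AppendConverter test above is disabled, aten::append is still
+// exercised indirectly: the __getitem__ graph above builds its input list with
+// aten::append before indexing into it.
+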
+TEST(Converters, SetitemConverterCorrectly) {
+    // aten::_set_item.t(t[](a!) l, int idx, t(b -> *) el) -> (t[](a!))
+    const auto graph = R"IR(
+      graph(%x.1 : Tensor,
+            %y.1 : Tensor):
+        %6 : int = prim::Constant[value=1]() # test.py:28:15
+        %10 : int = prim::Constant[value=0]() # test.py:28:6
+        %a.1 : Tensor[] = prim::ListConstruct(%x.1, %y.1)
+        %8 : Tensor = aten::add(%x.1, %6, %6) # test.py:28:11
+        %11 : Tensor[] = aten::_set_item(%a.1, %10, %8) # test.py:28:4
+        %ret.1 : Tensor = aten::cat(%a.1, %10) # test.py:29:10
+        return (%ret.1))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto input1 = at::randn({3, 4}, {at::kCUDA});
+    auto input2 = at::randn({3, 4}, {at::kCUDA});
+
+    input_data.push_back(input1);
+    input_data.push_back(input2);
+
+    baidu::mirana::poros::SetitemConverter setitemconverter;
+    aten_eval_test_helper(graph, input_data, &setitemconverter);
+}
+
+static void eval_dynamic_test_helper(const std::string& graph_IR,
+                                     baidu::mirana::poros::IConverter* converter,
+                                     const std::vector<at::Tensor>& input_data,
+                                     bool is_dynamic = false,
+                                     std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenGetitemdynamicConvertsCorrectly) {
+    // "aten::__getitem__.t(t[](a) list, int idx) -> (t(*))"
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int[] = aten::size(%0)
+        %2 : int = prim::Constant[value=1]()
+        %3 : int = aten::__getitem__(%1, %2)
+        %4 : Tensor = aten::add(%0, %3, %2)
+        return (%4))IR";
+    baidu::mirana::poros::GetitemConverter getitemconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[1].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+    prewarm_data[2].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int));
+
+    eval_dynamic_test_helper(graph_IR, &getitemconverter, input_data, true, &prewarm_data);
+}
diff --git a/poros/unittest/converter/batch_norm_test.cpp b/poros/unittest/converter/batch_norm_test.cpp
new file mode 100644
index 0000000000..08f5fbb7c2
--- /dev/null
+++ b/poros/unittest/converter/batch_norm_test.cpp
@@ -0,0 +1,146 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file batch_norm_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/batch_norm.h"
+#include "poros/util/test_util.h"
+
+TEST(Converters, ATenBatchnormalConvertsCorrectly) {
+    // aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1: Tensor, %2: Tensor, %3: Tensor, %4: Tensor):
+        %5 : bool = prim::Constant[value=0]()
+        %6 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %7 : float = prim::Constant[value=0.10000000000000001]()
+        %8 : Tensor = aten::batch_norm(%0, %1, %2, %3, %4, %5, %6, %7, %5)
+        return (%8))IR";
+
+    auto in = at::randn({1, 5, 5, 5}, {at::kCUDA});
+    auto gamma = at::randn({5}, {at::kCUDA});
+    auto beta = at::randn({5}, {at::kCUDA});
+    auto mean = at::randn({5}, {at::kCUDA});
+    auto var = at::randn({5}, {at::kCUDA}).abs();
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::BatchNormConverter batchnormconverter;
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &batchnormconverter,
+                {in, gamma, beta, mean, var}, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+/*
+aten::instance_norm(Tensor input,
+Tensor? weight,
+Tensor? bias,
+Tensor? running_mean,
+Tensor? running_var,
+bool use_input_stats,
+float momentum,
+float eps,
+bool cudnn_enabled) -> Tensor
+*/
+TEST(Converters, ATenInstanceNormConvertsCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1: Tensor, %2: Tensor):
+        %3 : NoneType = prim::Constant()
+        %4 : bool = prim::Constant[value=1]()
+        %5 : float = prim::Constant[value=0.10000000000000001]()
+        %6 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %7 : Tensor = aten::instance_norm(%0, %1, %2, %3, %3, %4, %5, %6, %4)
+        return (%7))IR";
+
+    auto input_tensor = at::randn({2, 10, 5, 5}, {at::kCUDA});
+    auto weight = at::randn({10}, {at::kCUDA});
+    auto bias = at::randn({10}, {at::kCUDA});
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::InstanceNormConverter instancenormconverter;
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &instancenormconverter,
+                {input_tensor, weight, bias}, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
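+// Both normalization ops compute y = (x - mean) / sqrt(var + eps) * weight + bias;
+// batch_norm takes running statistics as inputs, while instance_norm with
+// use_input_stats=true derives mean/var per instance and per channel. Without
+// weight/bias (the tests below), the affine step effectively uses weight=1, bias=0.
+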
+TEST(Converters, ATenInstanceNormConvertsNoWeightCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %3 : NoneType = prim::Constant()
+        %4 : bool = prim::Constant[value=1]()
+        %5 : float = prim::Constant[value=0.10000000000000001]()
+        %6 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %7 : Tensor = aten::instance_norm(%0, %3, %3, %3, %3, %4, %5, %6, %4)
+        return (%7))IR";
+
+    auto input_tensor = at::randn({2, 20, 45, 3}, {at::kCUDA});
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::InstanceNormConverter instancenormconverter;
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &instancenormconverter,
+                {input_tensor}, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenInstanceNormConverts3DCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %3 : NoneType = prim::Constant()
+        %4 : bool = prim::Constant[value=1]()
+        %5 : float = prim::Constant[value=0.10000000000000001]()
+        %6 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %7 : Tensor = aten::instance_norm(%0, %3, %3, %3, %3, %4, %5, %6, %4)
+        return (%7))IR";
+
+    auto input_tensor = at::randn({2, 20, 45}, {at::kCUDA});
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::InstanceNormConverter instancenormconverter;
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &instancenormconverter,
+                {input_tensor}, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/clone_test.cpp b/poros/unittest/converter/clone_test.cpp
new file mode 100644
index 0000000000..f0153e2139
--- /dev/null
+++ b/poros/unittest/converter/clone_test.cpp
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file clone_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Tue Nov 23 12:26:28 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/clone.h"
+#include "poros/util/test_util.h"
+
+static void clone_dy_test_helper(const std::string& graph_IR,
+                                 const std::vector<at::Tensor>& input_data,
+                                 bool is_dynamic = false,
+                                 std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::CloneConverter cloneconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &cloneconverter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
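+// Note: each clone graph below feeds the result through aten::relu so the
+// cloned tensor has a consumer inside the converted graph; a bare clone could
+// presumably be folded away as a no-op before conversion (an inference from
+// the test structure, not a documented requirement).
+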
+TEST(Converters, ATenCloneConvertsCorrectly) {
+    // aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %memory_format : None = prim::Constant[value=0]()
+        %1 : Tensor = aten::clone(%0, %memory_format)
+        %2 : Tensor = aten::relu(%1)
+        return (%2))IR";
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 100, 100, 100}, {at::kCUDA}));
+
+    clone_dy_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, ATenCloneConvertsDynamicCorrectly) {
+    // aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %memory_format : None = prim::Constant[value=0]()
+        %1 : Tensor = aten::clone(%0, %memory_format)
+        %2 : Tensor = aten::relu(%1)
+        return (%2))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 150, 100, 100}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({10, 100, 50, 50}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 100, 50, 50}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 100, 50, 50}, {at::kCUDA}));
+
+    clone_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/concat_test.cpp b/poros/unittest/converter/concat_test.cpp
new file mode 100644
index 0000000000..b631fccc21
--- /dev/null
+++ b/poros/unittest/converter/concat_test.cpp
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file concat_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/concat.h"
+#include "poros/util/test_util.h"
+
+static void cat_test_helper(const std::string& graph_IR,
+                            std::vector<int64_t> shape1 = {5},
+                            std::vector<int64_t> shape2 = {5},
+                            bool Triple_inputs = false,
+                            std::vector<int64_t> shape3 = {5}){
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    input_data.push_back(at::randn(shape2, {at::kCUDA}));
+    if (Triple_inputs){
+        input_data.push_back(at::randn(shape3, {at::kCUDA}));
+    }
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::ConcatConverter concatconverter;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &concatconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static std::string gen_double_inputs_cat_graph(const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor[] = prim::ListConstruct(%0, %1)
+        %3 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %4 : Tensor = aten::cat(%2, %3)
+        return (%4))IR";
+}
+
+static std::string gen_triple_inputs_cat_graph(const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor):
+        %3 : Tensor[] = prim::ListConstruct(%0, %1, %2)
+        %4 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %5 : Tensor = aten::cat(%3, %4)
+        return (%5))IR";
+}
+
+TEST(Converters, ATenCatPureTensorConvertsCorrectly) {
+    // aten::cat(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_cat_graph("0");
+    cat_test_helper(graph_IR);
+}
+
+TEST(Converters, ATenCatPureTensorNegDimConvertsCorrectly) {
+    // aten::cat(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_cat_graph("-1");
+    cat_test_helper(graph_IR, {5, 3}, {5, 4});
+}
+
+TEST(Converters, ATenCatTripleTensorConvertsCorrectly) {
+    // aten::cat(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_triple_inputs_cat_graph("1");
+    cat_test_helper(graph_IR, {5, 2, 2}, {5, 7, 2}, true, {5, 3, 2});
+}
+
+TEST(Converters, ATenCatTripleTensorNegdimConvertsCorrectly) {
+    // aten::cat(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_triple_inputs_cat_graph("-1");
+    cat_test_helper(graph_IR, {5, 6, 7}, {5, 6, 3}, true, {5, 6, 5});
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/constant_pad_nd_test.cpp b/poros/unittest/converter/constant_pad_nd_test.cpp
new file mode 100644
index 0000000000..f7aad85040
--- /dev/null
+++ b/poros/unittest/converter/constant_pad_nd_test.cpp
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file constant_pad_nd_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Thu Dec 2 14:29:20 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/util/test_util.h"
+#include "poros/converter/gpu/constant_pad_nd.h"
+
+static void constant_pad_nd_test_helper(const std::string& graph_IR,
+                                        std::vector<at::Tensor> input_data,
+                                        bool is_dynamic = false,
+                                        std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    baidu::mirana::poros::ConstantPadNdConverter constantpadndconverter;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &constantpadndconverter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+
+}
+
+static std::string gen_constant_pad_nd_graph(const std::string& padding_shape_str,
+                                             const std::string& value_str,
+                                             const bool padding_value_is_int = false) {
+    if (padding_value_is_int) {
+        return R"IR(
+          graph(%0 : Tensor):
+            %1 : int[] = prim::Constant[value=[)IR" + padding_shape_str + R"IR(]]()
+            %2 : int = prim::Constant[value=)IR" + value_str + R"IR(]()
+            %3 : Tensor = aten::constant_pad_nd(%0, %1, %2)
+            return (%3))IR";
+
+    } else {
+        return R"IR(
+          graph(%0 : Tensor):
+            %1 : int[] = prim::Constant[value=[)IR" + padding_shape_str + R"IR(]]()
+            %2 : float = prim::Constant[value=)IR" + value_str + R"IR(]()
+            %3 : Tensor = aten::constant_pad_nd(%0, %1, %2)
+            return (%3))IR";
+    }
+
+}
+
+TEST(Converters, TestAtenConstantPadNdCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2, 3, 4", "1.5");
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    constant_pad_nd_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdLastDimCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2", "1.5");
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    constant_pad_nd_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdZerosPaddingDimsCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("0, 1, 2, 0", "1.5");
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    constant_pad_nd_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdIntCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2, 3, 4", "1", true);
+    std::vector<at::Tensor> input_data;
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt);
+    input_data.push_back(at::randint(0, 10, {4, 5, 6, 7}, options_pyt));
+    constant_pad_nd_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdInputSingleDimCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2", "1.5");
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({6}, {at::kCUDA}));
+    constant_pad_nd_test_helper(graph_IR, input_data);
+}
+
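+// Reminder on aten::constant_pad_nd semantics: the padding list pairs up from
+// the last dimension backwards, e.g. [1, 2, 3, 4] pads the last dim by
+// (1 before, 2 after) and the second-to-last dim by (3 before, 4 after).
+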
+TEST(Converters, TestAtenConstantPadNdDynamicFloatCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2, 3, 4", "1.5");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({3, 4, 5, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+
+    constant_pad_nd_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdDynamicFloatTwoPaddingDimsZerosCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("2, 0, 0, 2", "1.5");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({3, 4, 5, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4, 5}, {at::kCUDA}));
+
+    constant_pad_nd_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdDynamicFloatSingleDimCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2", "1.5");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({10}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5}, {at::kCUDA}));
+
+    constant_pad_nd_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, TestAtenConstantPadNdDynamicIntCorrectly) {
+    const auto graph_IR = gen_constant_pad_nd_graph("1, 2, 3, 4", "2", true);
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    prewarm_data[0].push_back(at::randint(0, 10, {3, 4, 5, 6}, options_pyt));
+    prewarm_data[1].push_back(at::randint(0, 10, {2, 3, 4, 5}, options_pyt));
+    prewarm_data[2].push_back(at::randint(0, 10, {2, 3, 4, 5}, options_pyt));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randint(0, 10, {2, 3, 4, 5}, {at::kCUDA}));
+
+    constant_pad_nd_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/conv2d_test.cpp b/poros/unittest/converter/conv2d_test.cpp
new file mode 100644
index 0000000000..bb38183c67
--- /dev/null
+++ b/poros/unittest/converter/conv2d_test.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file conv2d_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+#include <string>
+#include <vector>
+
+#include "poros/util/test_util.h"
+#include "poros/converter/gpu/convolution.h"
+
+static void conv2d_test_helper(const std::string& graph_IR,
+                               baidu::mirana::poros::IConverter* converter,
+                               std::vector<int64_t> shape_inputs,
+                               std::vector<int64_t> shape_weights,
+                               std::vector<int64_t> shape_bias) {
+    std::vector<at::Tensor> input_data;
+    // auto in = at::randn({1, 3, 10, 10}, {at::kCUDA});
+    // auto w = at::randn({8, 3, 5, 5}, {at::kCUDA});
+    // auto b = at::randn({8}, {at::kCUDA});
+    auto in = at::randn(shape_inputs, {at::kCUDA});
+    auto w = at::randn(shape_weights, {at::kCUDA});
+    auto b = at::randn(shape_bias, {at::kCUDA});
+    input_data.push_back(in);
+    input_data.push_back(w);
+    input_data.push_back(b);
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    //ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 0.0001));
+}
+
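+// Note: the tolerance above is relaxed to 1e-4, with the stricter 2e-6 check
+// kept commented out; fp32 convolution reductions can accumulate more rounding
+// error than element-wise ops, which presumably motivates the looser bound.
+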
+TEST(Converters, ATenConv2dVggishTestConvertsCorrectly) {
+    // aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor):
+        %3 : int[] = prim::Constant[value=[1, 1]]()
+        %4 : int[] = prim::Constant[value=[1, 1]]()
+        %5 : int[] = prim::Constant[value=[1, 1]]()
+        %6 : int = prim::Constant[value=1]()
+        %7 : Tensor = aten::conv2d(%0, %1, %2, %3, %4, %5, %6)
+        return (%7))IR";
+    baidu::mirana::poros::ConvolutionConverter convolutionconverter;
+    conv2d_test_helper(graph_IR, &convolutionconverter, {60, 256, 12, 8}, {512, 256, 3, 3}, {512});
+}
diff --git a/poros/unittest/converter/einsum_test.cpp b/poros/unittest/converter/einsum_test.cpp
new file mode 100644
index 0000000000..ca43249f9f
--- /dev/null
+++ b/poros/unittest/converter/einsum_test.cpp
@@ -0,0 +1,150 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file einsum_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Jul 06 11:24:51 CST 2022
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/einsum.h"
+#include "poros/util/test_util.h"
+
+static void aten_einsum_test_helper(const std::string& equation,
+                                    at::Tensor input1,
+                                    at::Tensor input2 = at::Tensor()) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(input1);
+    if (input2.defined()) {
+        input_data.push_back(input2);
+    }
+
+    std::string graph_IR;
+    if (input_data.size() == 2) {
+        graph_IR = R"IR(
+          graph(%0 : Tensor, %1 : Tensor):
+            %eq : str = prim::Constant[value=")IR" + equation + R"IR("]()
+            %2 : Tensor[] = prim::ListConstruct(%0, %1)
+            %3 : Tensor = aten::einsum(%eq, %2)
+            return (%3))IR";
+    } else {
+        graph_IR = R"IR(
+          graph(%0 : Tensor):
+            %eq : str = prim::Constant[value=")IR" + equation + R"IR("]()
+            %2 : Tensor[] = prim::ListConstruct(%0)
+            %3 : Tensor = aten::einsum(%eq, %2)
+            return (%3))IR";
+    }
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::EinsumConverter einsumconverter;
+
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &einsumconverter,
+                input_data, graph_output, poros_output));
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenEinsumConverterCorrectly) {
+    // aten::einsum(str equation, Tensor[] tensors) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %eq : str = prim::Constant[value="bfnd,ndh->bfh"]()
+        %2 : Tensor[] = prim::ListConstruct(%0, %1)
+        %3 : Tensor = aten::einsum(%eq, %2)
+        return (%3))IR";
+
+    std::vector<at::Tensor> input_data;
+
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::randn({20, 30, 12, 26}, options_pyt_float));
+    input_data.push_back(at::randn({12, 26, 312}, options_pyt_float));
+
+    baidu::mirana::poros::EinsumConverter einsumconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &einsumconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenEinsumTorchExamplesTestConverterCorrectly) {
+    // Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    at::Tensor x = at::randn({5}, options_pyt_float);
+    at::Tensor y = at::randn({7}, options_pyt_float);
+    at::Tensor A = at::randn({3, 5}, options_pyt_float);
+    at::Tensor B = at::randn({2, 5}, options_pyt_float);
+    at::Tensor C = at::randn({2, 3, 5}, options_pyt_float);
+    at::Tensor D = at::randn({2, 5, 7}, options_pyt_float);
+    at::Tensor E = at::randn({7, 9}, options_pyt_float);
+    at::Tensor F = at::randn({2, 3, 3, 5}, options_pyt_float);
+    at::Tensor G = at::randn({5, 4, 6}, options_pyt_float);
+    at::Tensor H = at::randn({4, 4}, options_pyt_float);
+    at::Tensor I = at::randn({2, 3, 2}, options_pyt_float);
+
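+    // The helper above takes at most two operands, so every equation in this
+    // test uses one or two tensors; that limit comes from the helper's
+    // signature, not from aten::einsum itself.
+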
+    // vector operations
+    aten_einsum_test_helper("i->", x);         // sum
+    aten_einsum_test_helper("i,i->", x, x);    // dot
+    aten_einsum_test_helper("i,i->i", x, x);   // vector element-wise mul
+    aten_einsum_test_helper("i,j->j", x, y);   // outer
+
+    // Matrix operations
+    aten_einsum_test_helper("ij->ji", A);            // transpose
+    aten_einsum_test_helper("ij->j", A);             // row sum
+    aten_einsum_test_helper("ij->i", A);             // col sum
+    aten_einsum_test_helper("ij,ij->ij", A, A);      // matrix element-wise mul
+    aten_einsum_test_helper("ij,j->i", A, x);        // matrix vector multiplication
+    aten_einsum_test_helper("ij,kj->ik", A, B);      // matmul
+    aten_einsum_test_helper("ij,ab->ijab", A, E);    // matrix outer product
+
+    // Tensor operations
+    aten_einsum_test_helper("Aij,Ajk->Aik", C, D);     // batch matmul
+    aten_einsum_test_helper("ijk,jk->i", C, A);        // tensor matrix contraction
+    aten_einsum_test_helper("aij,jk->aik", D, E);      // tensor matrix contraction
+    aten_einsum_test_helper("abCd,dfg->abCfg", F, G);  // tensor tensor contraction
+    aten_einsum_test_helper("ijk,jk->ik", C, A);       // tensor matrix contraction with double indices
+    aten_einsum_test_helper("ijk,jk->ij", C, A);       // tensor matrix contraction with double indices
+    aten_einsum_test_helper("ijk,ik->j", C, B);        // non contiguous
+    aten_einsum_test_helper("ijk,ik->jk", C, B);       // non contiguous with double indices
+
+    // Diagonal operations are not permitted in poros
+    // aten_einsum_test_helper("ii", H);     // trace
+    // aten_einsum_test_helper("ii->i", H);  // diagonal
+    // aten_einsum_test_helper("iji->j", I); // non-contiguous trace
+    // aten_einsum_test_helper("ngrg...->nrg...", at::randn({2, 1, 3, 1, 4}, options_pyt_float));
+
+    // Ellipsis equations are not permitted in poros
+    // aten_einsum_test_helper("i...->...", H);
+    // aten_einsum_test_helper("ki,...k->i...", A.t(), B);
+    // aten_einsum_test_helper("k...,jk->...", A.t(), B);
+    // aten_einsum_test_helper('...ik, ...j -> ...ij', C, x);
+    // aten_einsum_test_helper('Bik,k...j->i...j', C, at::randn({5, 3}, options_pyt_float));
+    // aten_einsum_test_helper('i...j, ij... -> ...ij', C, at::randn({2, 5, 2, 3}, options_pyt_float));
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/element_wise_test.cpp b/poros/unittest/converter/element_wise_test.cpp
new file mode 100644
index 0000000000..80d23b4a27
--- /dev/null
+++ b/poros/unittest/converter/element_wise_test.cpp
@@ -0,0 +1,305 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file element_wise_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/element_wise.h"
+#include "poros/util/test_util.h"
+
+static void poros_test_helper(const std::string& graph_IR,
+                              baidu::mirana::poros::IConverter* converter,
+                              const std::vector<at::Tensor>& input_data){
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine, then compare outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    std::string pow_node_name("aten::pow");
+    if (converter->node_kind()[0].toQualString() == pow_node_name){
+        ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+    } else {
+        ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+    }
+}
+
+static void pow_test_examples(const std::string& graph_IR,
+                              baidu::mirana::poros::IConverter* converter,
+                              bool singleInput,
+                              std::vector<int64_t> shape1 = {5},
+                              std::vector<int64_t> shape2 = {5}){
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    if (!singleInput){
+        input_data.push_back(at::randint(-5, 5, shape2, {at::kCUDA}));
+    }
+    poros_test_helper(graph_IR, converter, input_data);
+}
+
+TEST(Converters, ATenPowTensorConvertsCorrectly) {
+    // aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor, %2 : Tensor):
+        %3 : Tensor = aten::pow(%1, %2)
+        return (%3))IR";
+    baidu::mirana::poros::PowOrFloordivideConverter poworfloordivideconverter;
+    pow_test_examples(graph_IR, &poworfloordivideconverter, false);
+    pow_test_examples(graph_IR, &poworfloordivideconverter, false, {3, 4}, {4});
+    pow_test_examples(graph_IR, &poworfloordivideconverter, false, {4}, {3, 4});
+    pow_test_examples(graph_IR, &poworfloordivideconverter, false, {3, 4, 3}, {4, 3});
+    pow_test_examples(graph_IR, &poworfloordivideconverter, false, {4, 3}, {3, 4, 3});
+}
+
+TEST(Converters, ATenPowScalarConvertsCorrectly) {
+    // aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : float = prim::Constant[value=2.0]()
+        %3 : Tensor = aten::pow(%1, %2)
+        return (%3))IR";
+    baidu::mirana::poros::PowOrFloordivideConverter poworfloordivideconverter;
+    pow_test_examples(graph_IR, &poworfloordivideconverter, true);
+    pow_test_examples(graph_IR, &poworfloordivideconverter, true, {3, 4});
+}
+
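+// Note: poros_test_helper compares aten::pow results with a small tolerance
+// while every other op here is compared bit-exactly; the comparison operators
+// below emit boolean tensors, where exact equality is the appropriate check.
+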
+static void elementwise_tensor_test_examples(const std::string& op,
+                                             baidu::mirana::poros::IConverter* converter){
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+        return (%2))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    poros_test_helper(graph_IR, converter, input_data);
+
+    input_data.clear();
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data[0][0][0] = 2.5;
+    input_data[1][0][0] = 2.5;
+    poros_test_helper(graph_IR, converter, input_data);
+
+    input_data.clear();
+    input_data.push_back(at::randn({3, 4, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({4, 3}, {at::kCUDA}));
+    input_data[0][0][0][0] = 2.5;
+    input_data[1][0][0] = 2.5;
+    poros_test_helper(graph_IR, converter, input_data);
+
+    input_data.clear();
+    input_data.push_back(at::randn({4, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({3, 4, 3}, {at::kCUDA}));
+    input_data[0][0][0] = 2.5;
+    input_data[1][0][0][0] = 2.5;
+    poros_test_helper(graph_IR, converter, input_data);
+}
+
+static void elementwise_scalar_test_examples(const std::string& op,
+                                             const std::string& scalar,
+                                             baidu::mirana::poros::IConverter* converter){
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : float = prim::Constant[value=)IR" + scalar + R"IR(]()
+        %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+        return (%2))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    poros_test_helper(graph_IR, converter, input_data);
+
+    input_data.clear();
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data[0][0][0] = 2.5;
+    poros_test_helper(graph_IR, converter, input_data);
+
+    input_data.clear();
+    input_data.push_back(at::randn({1}, {at::kCUDA}));
+    input_data[0][0] = 2.5;
+    poros_test_helper(graph_IR, converter, input_data);
+}
+
+TEST(Converters, ATenEqualTensorConvertsCorrectly) {
+    // aten::eq.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::EqualOrNotequalConverter equalorbotequalconverter;
+    elementwise_tensor_test_examples("eq", &equalorbotequalconverter);
+}
+
+TEST(Converters, ATenEqualScalarConvertsCorrectly) {
+    // aten::eq.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::EqualOrNotequalConverter equalorbotequalconverter;
+    elementwise_scalar_test_examples("eq", "2.5", &equalorbotequalconverter);
+}
+
+TEST(Converters, ATenNotEqualTensorConvertsCorrectly) {
+    // aten::ne.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::EqualOrNotequalConverter equalorbotequalconverter;
+    elementwise_tensor_test_examples("ne", &equalorbotequalconverter);
+}
+
+TEST(Converters, ATenNotEqualScalarConvertsCorrectly) {
+    // aten::ne.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::EqualOrNotequalConverter equalorbotequalconverter;
+    elementwise_scalar_test_examples("ne", "2.5", &equalorbotequalconverter);
+}
+
+TEST(Converters, ATenGtTensorConvertsCorrectly) {
+    // aten::gt.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_tensor_test_examples("gt", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenGtScalarConvertsCorrectly) {
+    // aten::gt.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_scalar_test_examples("gt", "2.5", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenLtTensorConvertsCorrectly) {
+    // aten::lt.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_tensor_test_examples("lt", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenLtScalarConvertsCorrectly) {
+    // aten::lt.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_scalar_test_examples("lt", "2.5", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenGeTensorConvertsCorrectly) {
+    // aten::ge.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_tensor_test_examples("ge", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenGeScalarConvertsCorrectly) {
+    // aten::ge.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_scalar_test_examples("ge", "2.5", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenLeTensorConvertsCorrectly) {
+    // aten::le.Tensor(Tensor self, Tensor other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_tensor_test_examples("le", &greaterorlessconverter);
+}
+
+TEST(Converters, ATenLeScalarConvertsCorrectly) {
+    // aten::le.Scalar(Tensor self, Scalar other) -> Tensor
+    baidu::mirana::poros::GreaterOrLessConverter greaterorlessconverter;
+    elementwise_scalar_test_examples("le", "2.5", &greaterorlessconverter);
+}
+
+static std::string gen_clamp_graph(const std::string& op,
+                const std::string& min_val,
+                const std::string& max_val){
+    if (op == "clamp"){
+        std::string min_val_IR;
+        std::string max_val_IR;
+        if (min_val.empty()){
+            min_val_IR = "None = prim::Constant()";
+        }else{
+            min_val_IR = "float = prim::Constant[value=" + min_val + "]()";
+        }
+        if (max_val.empty()){
+            max_val_IR = "None = prim::Constant()";
+        }else{
+            max_val_IR = "float = prim::Constant[value=" + max_val + "]()";
+        }
+        return R"IR(
+        graph(%0 : Tensor):
+            %1 : )IR" + min_val_IR + R"IR(
+            %2 : )IR" + max_val_IR + R"IR(
+            %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+            return (%3))IR";
+    }else if (op == "clamp_min"){
+        return R"IR(
+        graph(%0 : Tensor):
+            %1 : float = prim::Constant[value=)IR" + min_val + R"IR(]()
+            %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+            return (%2))IR";
+    }else if (op == "clamp_max"){
+        return R"IR(
+        graph(%0 : Tensor):
+            %1 : float = prim::Constant[value=)IR" + max_val + R"IR(]()
+            %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+            return (%2))IR";
+    }else{
+        return "";
+    }
+}
+
+TEST(Converters, ATenClampMinConvertsCorrectly) {
+    // aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp", "1.5", "");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
+
+TEST(Converters, ATenClampMaxConvertsCorrectly) {
+    // aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp", "", "0.5");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
+
+TEST(Converters, ATenClampMinMaxConvertsCorrectly) {
+    // aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp", "-0.5", "0.5");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
+
+TEST(Converters, ATenClampMaximumConvertsCorrectly) {
+    // aten::clamp_max(Tensor self, Scalar max) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp_max", "", "0.5");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
+
+TEST(Converters, ATenClampMinimumConvertsCorrectly) {
+    // aten::clamp_min(Tensor self, Scalar min) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp_min", "-0.5", "");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
+
+TEST(Converters, ATenClampMinGtMaxConvertsCorrectly) {
+    // aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    const auto graph_IR = gen_clamp_graph("clamp", "0.5", "-0.5");
+    baidu::mirana::poros::ClampConverter clampconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    poros_test_helper(graph_IR, &clampconverter, input_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/expand_test.cpp b/poros/unittest/converter/expand_test.cpp
new file mode 100644
index 0000000000..e2221cef9b
--- /dev/null
+++ b/poros/unittest/converter/expand_test.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
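+
+// Illustrative eager-mode sketch of the broadcast semantics exercised by the
+// tests in this file (assumes a CUDA-enabled libtorch; not part of the tests):
+//
+//   at::Tensor t = at::randn({3, 1}, {at::kCUDA});
+//   at::Tensor e = t.expand({2, 3, 4}); // sizes align from the back: [3,1] -> [2,3,4]
+//   at::Tensor r = t.repeat({4, 2});    // repeats multiply sizes:    [3,1] -> [12,2]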
+
+/**
+* @file expand_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <string>
+#include <gtest/gtest.h>
+
+#include "poros/converter/gpu/expand.h"
+#include "poros/util/test_util.h"
+
+static void expand_test_helper(const std::string& graph_IR,
+                baidu::mirana::poros::IConverter* converter,
+                bool singleInput,
+                std::vector<int64_t> shape1 = {3, 1},
+                std::vector<int64_t> shape2 = {3, 1}){
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    if (!singleInput){
+        input_data.push_back(at::randn(shape2, {at::kCUDA}));
+    }
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    // ASSERT_TRUE(baidu::mirana::poros::testutil::almostEqual(graph_output[0], poros_output[0], 2e-6));
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static std::string gen_expand_graph(const std::string& size, const std::string& implicit) {
+    return R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = prim::Constant[value=[)IR" + size + R"IR(]]()
+            %2 : bool = prim::Constant[value=)IR" + implicit + R"IR(]()
+            %3 : Tensor = aten::expand(%0, %1, %2)
+            return (%3))IR";
+}
+
+static std::string gen_repeat_graph(const std::string& size) {
+    return R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = prim::Constant[value=[)IR" + size + R"IR(]]()
+            %2 : Tensor = aten::repeat(%0, %1)
+            return (%2))IR";
+}
+
+TEST(Converters, ATenExpandSameDimConvertsCorrectly) {
+    // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
+    const auto graph_IR = gen_expand_graph("3, 4", "0");
+    baidu::mirana::poros::ExpandConverter expandconverter;
+    expand_test_helper(graph_IR, &expandconverter, true);
+}
+
+TEST(Converters, ATenExpandTileConvertsCorrectly) {
+    // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
+    // If %2 holds more dimensions than %1, expand aligns the sizes from the
+    // trailing dimension backward, e.g.:
+    // [3,1] [2,3,4] -> [2,3,4]
+    // [3,1] [1,3,4] -> [1,3,4]
+    // [3,1] [3,-1,4] -> [3,3,4]
+    const auto graph_IR = gen_expand_graph("2, 3, 4", "0");
+    baidu::mirana::poros::ExpandConverter expandconverter;
+    expand_test_helper(graph_IR, &expandconverter, true);
+}
+
+TEST(Converters, ATenExpandTileLastConvertsCorrectly) {
+    // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
+    const auto graph_IR = gen_expand_graph("1, 3, 4", "0");
+    baidu::mirana::poros::ExpandConverter expandconverter;
+    expand_test_helper(graph_IR, &expandconverter, true);
+}
+
+TEST(Converters, ATenExpandNegativeSizeConvertsCorrectly) {
+    // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
+    // -1 means not changing the size of that dimension
+    const auto graph_IR = gen_expand_graph("3, -1, 4", "0");
+    baidu::mirana::poros::ExpandConverter expandconverter;
+    expand_test_helper(graph_IR, &expandconverter, true);
+}
+
+TEST(Converters, ATenRepeatConvertsCorrectly) {
+    // aten::repeat(Tensor self, int[] repeats) -> Tensor
+    // Output shape: align the repeats to the input shape from the trailing
+    // dimension backward (as in expand, when %1 and %2 differ in rank), then
+    // multiply the sizes element-wise, e.g.:
+    // [3,1] [4,2] -> [12,2]
+    // [2,3,2] [2,2,2] -> [4,6,4]
+    // [3,1] [1,3,2] -> [1,9,2]
+    const auto graph_IR = gen_repeat_graph("4, 2");
+    baidu::mirana::poros::RepeatConverter repeatconverter;
+    expand_test_helper(graph_IR, &repeatconverter, true);
+}
+
+TEST(Converters, ATenRepeat3dConvertsCorrectly) {
+    // aten::repeat(Tensor self, int[] repeats) -> Tensor
+    const auto graph_IR = gen_repeat_graph("2, 2, 2");
+    baidu::mirana::poros::RepeatConverter repeatconverter;
+    expand_test_helper(graph_IR, &repeatconverter, true, {2, 3, 2});
+}
+
+TEST(Converters, ATenRepeatExtraDimsConvertsCorrectly) {
+    // aten::repeat(Tensor self, int[] repeats) -> Tensor
+    const auto graph_IR = gen_repeat_graph("1, 3, 2");
+    baidu::mirana::poros::RepeatConverter repeatconverter;
+    expand_test_helper(graph_IR, &repeatconverter, true);
+}
+
+static void expand_dynamic_test_helper(const std::string& graph_IR,
+                baidu::mirana::poros::IConverter* converter,
+                const std::vector<at::Tensor>& input_data,
+                bool is_dynamic = false,
+                std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenExpandFromSizedynamicConvertsCorrectly) {
+    // aten::expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor, %1 : Tensor):
+            %2 : int = prim::Constant[value=-1]()
+            %3 : int[] = aten::size(%0)
+            %B.1 : int, %H.1 : int, %W.1 : int, %C.1 : int = prim::ListUnpack(%3)
+            %4 : int[] = prim::ListConstruct(%B.1, %2, %C.1)
+            %5 : Tensor = aten::reshape(%0, %4)
+            %6 : int[] = aten::size(%5)
+            %B.2 : int, %N.2 : int, %C.2 : int = prim::ListUnpack(%6)
+            %7 : int[] = prim::ListConstruct(%B.2, %2, %2)
+            %8 : bool = prim::Constant[value=0]()
+            %9 : Tensor = aten::expand(%1, %7, %8)
+            return (%9))IR";
+    baidu::mirana::poros::ExpandConverter expandconverter;
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 24, 24, 512}, {at::kCUDA}));
+    input_data.push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({4, 24, 24, 512}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 24, 24, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 24, 24, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+
+    expand_dynamic_test_helper(graph_IR, &expandconverter, input_data, true, &prewarm_data);
+}
+
+
+/*aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)*/
+static std::string gen_expand_as_graph() {
+    return R"IR(
+        graph(%0 : Tensor, %1 : Tensor):
+            %3 : Tensor = aten::expand_as(%0, %1)
+            return (%3))IR";
+}
+
+TEST(Converters, ATenExpandAsConvertsCorrectly) {
+    /*aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)*/
+    const auto graph_IR = gen_expand_as_graph();
+    baidu::mirana::poros::ExpandConverter expandconverter;
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    input_data.push_back(at::randn({2, 24, 1, 512}, {at::kCUDA}));
+
+    expand_dynamic_test_helper(graph_IR, &expandconverter, input_data);
+}
+
+TEST(Converters, ATenExpandAsDynamicConvertsCorrectly) {
+    /*aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)*/
+    const auto graph_IR = gen_expand_as_graph();
+    baidu::mirana::poros::ExpandConverter expandconverter;
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    input_data.push_back(at::randn({2, 24, 1, 512}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({4, 24, 1, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 24, 1, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({1, 1, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 24, 1, 512}, {at::kCUDA}));
+
+    expand_dynamic_test_helper(graph_IR, &expandconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenExpandAsDynamicMoreConvertsCorrectly) {
+    /*aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)*/
+    const auto graph_IR = gen_expand_as_graph();
+    baidu::mirana::poros::ExpandConverter expandconverter;
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({24, 1, 512}, {at::kCUDA}));
+    input_data.push_back(at::randn({4, 24, 1, 512}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({24, 1, 512}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({4, 24, 1, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 1, 512}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 2, 1, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 1, 512}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 4, 1, 512}, {at::kCUDA}));
+
+    expand_dynamic_test_helper(graph_IR, &expandconverter, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/generate_test.cpp b/poros/unittest/converter/generate_test.cpp
new file mode 100644
index 0000000000..261d893075
--- /dev/null
+++ b/poros/unittest/converter/generate_test.cpp
@@ -0,0 +1,546 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
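+
+// Note on the prewarm convention used below (an assumption inferred from how
+// the helpers consume it): for dynamic-shape tests, prewarm_data holds three
+// input sets, which appear to map to the max/min/opt shape profiles used when
+// building a dynamic TensorRT engine, e.g.:
+//
+//   std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+//   prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA})); // largest shape
+//   prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA})); // smallest shape
+//   prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA})); // most common shape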
+
+/**
+* @file generate_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Tue Nov 23 12:26:28 CST 2021
+* @brief
+**/
+#include <string>
+#include <gtest/gtest.h>
+
+#include "poros/converter/gpu/generate.h"
+#include "poros/util/test_util.h"
+
+static void generate_dy_test_helper(const std::string& graph_IR,
+                baidu::mirana::poros::IConverter* converter,
+                const std::vector<at::Tensor>& input_data,
+                bool is_dynamic = false,
+                std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenZeroslikeConvertsCorrectly) {
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %zerosout : Tensor = aten::zeros_like(%0, %1, %1, %1, %1, %1)
+            return (%zerosout))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosLikeConverter zeroslikeconverter;
+    generate_dy_test_helper(graph_IR, &zeroslikeconverter, input_data);
+}
+
+TEST(Converters, ATenZeroslikeDtypeConvertsCorrectly) {
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    // scalar type index in aten, and support status ('o' means supported, 'x' means not supported):
+    // uint8_t -> 0  x
+    // int8_t  -> 1  x
+    // int16_t -> 2  x
+    // int     -> 3  o
+    // int64_t -> 4  x
+    // Half    -> 5  o
+    // float   -> 6  o
+    // bool    -> 11 x
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %2 : int = prim::Constant[value=3]()
+            %zerosout : Tensor = aten::zeros_like(%0, %2, %1, %1, %1, %1)
+            return (%zerosout))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosLikeConverter zeroslikeconverter;
+    generate_dy_test_helper(graph_IR, &zeroslikeconverter, input_data);
+}
+
+TEST(Converters, ATenZeroslikeDynamicConvertsCorrectly) {
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %zerosout : Tensor = aten::zeros_like(%0, %1, %1, %1, %1, %1)
+            return (%zerosout))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosLikeConverter zeroslikeconverter;
+    generate_dy_test_helper(graph_IR, &zeroslikeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenZeroslikeDynamicDtypeConvertsCorrectly) {
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %2 : int = prim::Constant[value=5]()
+            %zerosout : Tensor = aten::zeros_like(%0, %2, %1, %1, %1, %1)
+            return (%zerosout))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosLikeConverter zeroslikeconverter;
+    generate_dy_test_helper(graph_IR, &zeroslikeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenZerosDynamicConvertsCorrectly) {
+    // aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = aten::size(%0)
+            %2 : None = prim::Constant()
+            %3 : Device = prim::Constant[value="cuda"]()
+            %4 : Tensor = aten::zeros(%1, %2, %2, %3, %2)
+            return (%4))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosConverter zerosconverter;
+    generate_dy_test_helper(graph_IR, &zerosconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenZerosDynamicDtypeConvertsCorrectly) {
+    // aten::zeros(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = aten::size(%0)
+            %2 : None = prim::Constant()
+            %3 : Device = prim::Constant[value="cuda"]()
+            %4 : int = prim::Constant[value=3]()
+            %5 : Tensor = aten::zeros(%1, %4, %2, %3, %2)
+            return (%5))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::ZerosConverter zerosconverter;
+    generate_dy_test_helper(graph_IR, &zerosconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenOnesDynamicConvertsCorrectly) {
+    // aten::ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = aten::size(%0)
+            %2 : None = prim::Constant()
+            %3 : Device = prim::Constant[value="cuda"]()
+            %4 : Tensor = aten::ones(%1, %2, %2, %3, %2)
+            return (%4))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::OnesConverter onesconverter;
+    generate_dy_test_helper(graph_IR, &onesconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenOnesDynamicDtypeConvertsCorrectly) {
+    // aten::ones(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = aten::size(%0)
+            %2 : None = prim::Constant()
+            %3 : Device = prim::Constant[value="cuda"]()
+            %4 : int = prim::Constant[value=5]()
+            %5 : Tensor = aten::ones(%1, %4, %2, %3, %2)
+            return (%5))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::OnesConverter onesconverter;
+    generate_dy_test_helper(graph_IR, &onesconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenFullDynamicDtypeConvertsCorrectly) {
+    // aten::full(int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int[] = aten::size(%0)
+            %2 : None = prim::Constant()
+            %3 : Device = prim::Constant[value="cuda"]()
+            %4 : int = prim::Constant[value=6]()
+            %5 : Tensor = aten::full(%1, %4, %4, %2, %3, %2)
+            return (%5))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::FullConverter fullconverter;
+    generate_dy_test_helper(graph_IR, &fullconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenArangeDynamicDtypeConvertsCorrectly) {
+    // aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int = prim::Constant[value=1]()
+            %2 : int = aten::size(%0, %1)
+            %3 : None = prim::Constant()
+            %4 : Device = prim::Constant[value="cuda"]()
+            %5 : int = prim::Constant[value=3]()
+            %6 : Tensor = aten::arange(%2, %5, %3, %4, %3)
+            return (%6))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    baidu::mirana::poros::ArangeConverter arangeconverter;
+    generate_dy_test_helper(graph_IR, &arangeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenArangeStartEndDynamicDtypeConvertsCorrectly) {
+    // aten::arange.start(Scalar start, Scalar end, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int = prim::Constant[value=0]()
+            %2 : int = prim::Constant[value=1]()
+            %s.1 : int = aten::size(%0, %1)
+            %s.2 : int = aten::size(%0, %2)
+            %3 : None = prim::Constant()
+            %4 : Device = prim::Constant[value="cuda"]()
+            %5 : int = prim::Constant[value=3]()
+            %6 : Tensor = aten::arange(%s.1, %s.2, %5, %3, %4, %3)
+            return (%6))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({1, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({1, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({1, 5}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({1, 5}, {at::kCUDA}));
+    baidu::mirana::poros::ArangeConverter arangeconverter;
+    generate_dy_test_helper(graph_IR, &arangeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenArangeStartConstantEndDynamicDtypeConvertsCorrectly) {
+    // aten::arange.start(Scalar start, Scalar end, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %s.1 : int = prim::Constant[value=-10]()
+            %1 : int = prim::Constant[value=1]()
+            %s.2 : int = aten::size(%0, %1)
+            %3 : None = prim::Constant()
+            %4 : Device = prim::Constant[value="cuda"]()
+            %5 : int = prim::Constant[value=6]()
+            %6 : Tensor = aten::arange(%s.1, %s.2, %5, %3, %4, %3)
+            return (%6))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({1, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({1, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({1, 5}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({1, 5}, {at::kCUDA}));
+    baidu::mirana::poros::ArangeConverter arangeconverter;
+    generate_dy_test_helper(graph_IR, &arangeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenTensorDynamicDtypeConvertsCorrectly) {
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : bool = prim::Constant[value=0]()
+            %2 : Device = prim::Constant[value="cuda:0"]()
+            %3 : int = prim::Constant[value=6]()
+            %4 : int[] = aten::size(%0)
+            %5 : Tensor = aten::tensor(%4, %3, %2, %1)
+            return (%5))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({11, 2, 1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({10, 2, 1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 2, 1}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 2, 1}, {at::kCUDA}));
+    baidu::mirana::poros::TensorConverter tensorconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &tensorconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenLinspaceScalarTensorConvertsCorrectly) {
+    // aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+    // aten::linspace can currently only be unit-tested with a dynamic graph;
+    // in the static case some passes fold the op into a constant
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %2 : int = prim::Constant[value=0]()
+            %3 : None = prim::Constant()
+            %start : int = prim::Constant[value=-10]()
+            %end : int = prim::Constant[value=100]()
+            %step : int = aten::size(%0, %2)
+            %device : Device = prim::Constant[value="cuda"]()
+            %5 : Tensor = aten::linspace(%start, %end, %step, %3, %3, %device, %3)
+            %6 : Tensor = aten::mul(%0, %5)
+            return (%6))IR";
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::ones({6}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({6}, {at::kCUDA}));
+
+    baidu::mirana::poros::LinspaceConverter linspaceconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &linspaceconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenLinspaceStartEndDiffTypeConvertsCorrectly) {
+    // aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %2 : int = prim::Constant[value=0]()
+            %3 : None = prim::Constant()
+            %start : int = prim::Constant[value=-10]()
+            %end : float = prim::Constant[value=43.3]()
+            %step : int = aten::size(%0, %2)
+            %device : Device = prim::Constant[value="cuda"]()
+            %5 : Tensor = aten::linspace(%start, %end, %step, %3, %3, %device, %3)
+            %6 : Tensor = aten::mul(%0, %5)
+            return (%6))IR";
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::ones({6}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({6}, {at::kCUDA}));
+
+    baidu::mirana::poros::LinspaceConverter linspaceconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &linspaceconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenLinspaceStepNoneConvertsCorrectly) {
+    std::string graph_IR_str;
+    if (TORCH_VERSION_MAJOR < 2 && TORCH_VERSION_MINOR < 11) {
+        // aten::linspace(Scalar start, Scalar end, int? steps=None, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+        graph_IR_str = R"IR(
+        graph(%0 : Tensor, %1 : Tensor):
+            %2 : int = prim::Constant[value=0]()
+            %3 : None = prim::Constant()
+            %start : int = aten::size(%0, %2)
+            %end : float = prim::Constant[value=43.3]()
+            %device : Device = prim::Constant[value="cuda"]()
+            %5 : Tensor = aten::linspace(%start, %end, %3, %3, %3, %device, %3)
+            %6 : Tensor = aten::mul(%1, %5)
+            return (%6))IR";
+    } else {
+        // aten::linspace(Scalar start, Scalar end, int steps, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None) -> (Tensor)
+        graph_IR_str = R"IR(
+        graph(%0 : Tensor, %1 : Tensor):
+            %2 : int = prim::Constant[value=0]()
+            %3 : None = prim::Constant()
+            %start : int = aten::size(%0, %2)
+            %end : float = prim::Constant[value=43.3]()
+            %step : int = prim::Constant[value=100]()
+            %device : Device = prim::Constant[value="cuda"]()
+            %5 : Tensor = aten::linspace(%start, %end, %step, %3, %3, %device, %3)
+            %6 : Tensor = aten::mul(%1, %5)
+            return (%6))IR";
+    }
+    const std::string graph_IR = graph_IR_str;
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::ones({1}, {at::kCUDA}));
+    input_data.push_back(at::ones({100}, {at::kCUDA}));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::ones({6}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::ones({100}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({100}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({100}, {at::kCUDA}));
+
+    baidu::mirana::poros::LinspaceConverter linspaceconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &linspaceconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenFulllikeConvertsCorrectly) {
+    // aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %scalar : float = prim::Constant[value=2.5]()
+            %out : Tensor = aten::full_like(%0, %scalar, %1, %1, %1, %1, %1)
+            return (%out))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    baidu::mirana::poros::FulllikeConverter fulllikeconverter;
+    generate_dy_test_helper(graph_IR, &fulllikeconverter, input_data);
+}
+
+TEST(Converters, ATenFulllikeDefaultTypeConvertsCorrectly) {
+    // aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %scalar : float = prim::Constant[value=2.5]()
+            %out : Tensor = aten::full_like(%0, %scalar, %1, %1, %1, %1, %1)
+            return (%out))IR";
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_int = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt);
+    input_data.push_back(at::zeros({2, 3, 4}, options_pyt_int));
+    baidu::mirana::poros::FulllikeConverter fulllikeconverter;
+    generate_dy_test_helper(graph_IR, &fulllikeconverter, input_data);
+}
+
+TEST(Converters, ATenFulllikeDtypeConvertsCorrectly) {
+    // aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+    // scalar type index in aten, and support status ('o' means supported, 'x' means not supported):
+    // uint8_t -> 0  x
+    // int8_t  -> 1  x
+    // int16_t -> 2  x
+    // int     -> 3  o
+    // int64_t -> 4  x
+    // Half    -> 5  o
+    // float   -> 6  o
+    // bool    -> 11 x
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %2 : int = prim::Constant[value=6]()
+            %scalar : int = prim::Constant[value=2]()
+            %out : Tensor = aten::full_like(%0, %scalar, %2, %1, %1, %1, %1)
+            return (%out))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    baidu::mirana::poros::FulllikeConverter fulllikeconverter;
+    generate_dy_test_helper(graph_IR, &fulllikeconverter, input_data);
+}
+
+TEST(Converters, ATenFulllikeDynamicConvertsCorrectly) {
+    // aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %scalar : int = prim::Constant[value=2]()
+            %out : Tensor = aten::full_like(%0, %scalar, %1, %1, %1, %1, %1)
+            return (%out))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    baidu::mirana::poros::FulllikeConverter fulllikeconverter;
+    generate_dy_test_helper(graph_IR, &fulllikeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenFulllikeDynamicDtypeConvertsCorrectly) {
+    // aten::full_like(Tensor self, Scalar fill_value, *, int? dtype=None, int? layout=None, Device? device=None, bool? pin_memory=None, int? memory_format=None) -> (Tensor)
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : None = prim::Constant()
+            %2 : int = prim::Constant[value=3]()
+            %scalar : float = prim::Constant[value=2.5]()
+            %out : Tensor = aten::full_like(%0, %scalar, %2, %1, %1, %1, %1)
+            return (%out))IR";
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({4, 5, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 3, 4}, {at::kCUDA}));
+    baidu::mirana::poros::FulllikeConverter fulllikeconverter;
+    generate_dy_test_helper(graph_IR, &fulllikeconverter, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/group_norm_test.cpp b/poros/unittest/converter/group_norm_test.cpp
new file mode 100644
index 0000000000..7dafc83652
--- /dev/null
+++ b/poros/unittest/converter/group_norm_test.cpp
@@ -0,0 +1,188 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file group_norm_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <string>
+#include <gtest/gtest.h>
+
+#include "poros/converter/gpu/group_norm.h"
+#include "poros/util/test_util.h"
+
+static void groupnorm_test_helper(const std::string& graph_IR,
+                std::vector<at::Tensor>& input_data) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::GroupNormConverter groupnormconverter;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &groupnormconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenGroupNormConvertsCorrectly) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor,
+              %gamma : Tensor,
+              %beta : Tensor):
+            %1 : int = prim::Constant[value=2]()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %gamma, %beta, %8, %7)
+            return (%9))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    input_data.push_back(at::randn({10}, {at::kCUDA}));
+    groupnorm_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, ATenGroupNormConvertsCorrectly2InputsGamma) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor, %gamma : Tensor):
+            %1 : int = prim::Constant[value=20]()
+            %2 : None = prim::Constant()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %gamma, %2, %8, %7)
+            return (%9))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 100, 50, 50}, {at::kCUDA}));
+    input_data.push_back(at::randn({100}, {at::kCUDA}));
+    groupnorm_test_helper(graph_IR, input_data);
+}
+
+TEST(Converters, ATenGroupNormConvertsCorrectlyOneInput) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int = prim::Constant[value=20]()
+            %2 : None = prim::Constant()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %2, %2, %8, %7)
+            return (%9))IR";
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({4, 100, 50, 50}, {at::kCUDA}));
+    groupnorm_test_helper(graph_IR, input_data);
+}
+
+
+static void groupnorm_dy_test_helper(const std::string& graph_IR,
+                const std::vector<at::Tensor>& input_data,
+                bool is_dynamic = false,
+                std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::GroupNormConverter groupnormconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &groupnormconverter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenGroupNormConvertsDynamicCorrectly) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor,
+              %gamma : Tensor,
+              %beta : Tensor):
+            %1 : int = prim::Constant[value=2]()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %gamma, %beta, %8, %7)
+            return (%9))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 3, 3}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({10}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({10}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+    input_data.push_back(at::ones({10}, {at::kCUDA}));
+    input_data.push_back(at::ones({10}, {at::kCUDA}));
+
+    groupnorm_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenGroupNormConvertsCorrectlyDynamic2Inputsgamma) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor, %gamma : Tensor):
+            %1 : int = prim::Constant[value=2]()
+            %2 : None = prim::Constant()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %gamma, %2, %8, %7)
+            return (%9))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 100, 50, 50}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::ones({100}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({10, 100, 40, 40}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::ones({100}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 100, 40, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::ones({100}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 100, 40, 40}, {at::kCUDA}));
+    input_data.push_back(at::ones({100}, {at::kCUDA}));
+
+    groupnorm_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenGroupNormConvertsDynamicOneInputCorrectly) {
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    const auto graph_IR = R"IR(
+        graph(%0 : Tensor):
+            %1 : int = prim::Constant[value=2]()
+            %2 : None = prim::Constant()
+            %7 : bool = prim::Constant[value=0]()
+            %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+            %9 : Tensor = aten::group_norm(%0, %1, %2, %2, %8, %7)
+            return (%9))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 6, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 10, 3, 3}, {at::kCUDA}));
+
+    groupnorm_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/interpolate_test.cpp b/poros/unittest/converter/interpolate_test.cpp
new file mode 100644
index 0000000000..7265c7efcd
--- /dev/null
+++ b/poros/unittest/converter/interpolate_test.cpp
@@ -0,0 +1,273 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
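+
+// Note on the ".vec" overloads exercised in this file: in the aten schema,
+// e.g. aten::upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors),
+// output_size and scale_factors are both optional and exactly one of them is
+// expected to be non-None; that is why the graph generators below emit
+// prim::Constant() (None) for whichever argument a test leaves empty.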
+
+/**
+* @file interpolate_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <string>
+#include <gtest/gtest.h>
+
+#include "poros/converter/gpu/interpolate.h"
+#include "poros/util/test_util.h"
+
+static void interpolate_test_helper(const std::string& graph_IR,
+                baidu::mirana::poros::IConverter* converter,
+                std::vector<int64_t> shape){
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine, then collect both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+static std::string gen_upsample_nearest_nd_graph(bool vec_scales,
+                const std::string& op,
+                const std::string& output_size,
+                const std::string& scales) {
+    std::string output_ir("");
+    std::string scales_ir("");
+    std::string op_ir("");
+    if (!vec_scales) {
+        output_ir = "int[] = prim::Constant[value=[" + output_size + "]]()";
+        if (scales.empty()) {
+            scales_ir = "None = prim::Constant()";
+        } else {
+            scales_ir = "float = prim::Constant[value=" + scales + "]()";
+        }
+        if (op == "upsample_nearest1d") {
+            op_ir = op + "(%0, %1, %2)";
+        } else if (op == "upsample_nearest2d") {
+            op_ir = op + "(%0, %1, %2, %2)";
+        } else if (op == "upsample_nearest3d") {
+            op_ir = op + "(%0, %1, %2, %2, %2)";
+        } else {
+            return "";
+        }
+    } else {
+        if (output_size.empty()) {
+            output_ir = "None = prim::Constant()";
+        } else {
+            output_ir = "int[] = prim::Constant[value=[" + output_size + "]]()";
+        }
+        if (scales.empty()) {
+            scales_ir = "None = prim::Constant()";
+        } else {
+            scales_ir = "float[] = prim::Constant[value=[" + scales + "]]()";
+        }
+        op_ir = op + "(%0, %1, %2)";
+    }
+    return R"IR(
+        graph(%0 : Tensor):
+            %1 : )IR" + output_ir + R"IR(
+            %2 : )IR" + scales_ir + R"IR(
+            %3 : Tensor = aten::)IR" + op_ir + R"IR(
+            return (%3))IR";
+}
+
+static std::string gen_upsample_linear_graph(bool vec_scales,
+                const std::string& op,
+                const std::string& output_size,
+                const std::string& align_corners,
+                const std::string& scales) {
+    std::string output_ir("");
+    std::string scales_ir("");
+    std::string op_ir("");
+    if (!vec_scales) {
+        output_ir = "int[] = prim::Constant[value=[" + output_size + "]]()";
+        if (scales.empty()) {
+            scales_ir = "None = prim::Constant()";
+        } else {
+            scales_ir = "float = prim::Constant[value=" + scales + "]()";
+        }
+        if (op == "upsample_linear1d") {
+            op_ir = op + "(%0, %1, %2, %3)";
+        } else if (op == "upsample_bilinear2d") {
+            op_ir = op + "(%0, %1, %2, %3, %3)";
+        } else if (op == "upsample_trilinear3d") {
+            op_ir = op + "(%0, %1, %2, %3, %3, %3)";
+        } else {
+            return "";
+        }
+    } else {
+        if (output_size.empty()) {
+            output_ir = "None = prim::Constant()";
+        } else {
+            output_ir = "int[] = prim::Constant[value=[" + output_size + "]]()";
+        }
+        if (scales.empty()) {
+            scales_ir = "None = prim::Constant()";
+        } else {
+            scales_ir = "float[] = prim::Constant[value=[" + scales + "]]()";
+        }
+        op_ir = op + "(%0, %1, %2, %3)";
+    }
+
+    return R"IR(
+        graph(%0 : Tensor):
+            %1 : )IR" + output_ir + R"IR(
+            %2 : bool = prim::Constant[value=)IR" + align_corners + R"IR(]()
+            %3 : )IR" + scales_ir + R"IR(
+            %4 : Tensor = aten::)IR" + op_ir + R"IR(
+            return (%4))IR";
+}
+
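+// For orientation, a sketch of the IR produced by the helper above; e.g.
+// gen_upsample_linear_graph(false, "upsample_bilinear2d", "10, 8", "1", "")
+// should yield a graph equivalent to:
+//
+//   graph(%0 : Tensor):
+//       %1 : int[] = prim::Constant[value=[10, 8]]()
+//       %2 : bool = prim::Constant[value=1]()
+//       %3 : None = prim::Constant()
+//       %4 : Tensor = aten::upsample_bilinear2d(%0, %1, %2, %3, %3)
+//       return (%4)
+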
+TEST(Converters, ATenUpsampleNearest1d) {
+    // aten::upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest1d", "10", "");
+    baidu::mirana::poros::UnsampleNearest1DConverter unsamplenearest1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest1dScalar) {
+    // aten::upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest1d", "8", "4.0");
+    baidu::mirana::poros::UnsampleNearest1DConverter unsamplenearest1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest1dVecScalar) {
+    // aten::upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(true, "upsample_nearest1d", "", "4.0");
+    baidu::mirana::poros::UnsampleNearest1DConverter unsamplenearest1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest2d) {
+    // aten::upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest2d", "10, 8", "");
+    baidu::mirana::poros::UnsampleNearest2DConverter unsamplenearest2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest2dScalar) {
+    // aten::upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest2d", "8, 8", "4.0");
+    baidu::mirana::poros::UnsampleNearest2DConverter unsamplenearest2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest2dVecScalar) {
+    // aten::upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(true, "upsample_nearest2d", "", "5.0, 4.0");
+    baidu::mirana::poros::UnsampleNearest2DConverter unsamplenearest2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest3d) {
+    // aten::upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest3d", "10, 8, 6", "");
+    baidu::mirana::poros::UnsampleNearest3DConverter unsamplenearest3dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest3dconverter, {10, 2, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest3dScalar) {
+    // aten::upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(false, "upsample_nearest3d", "8, 8, 8", "4.0");
+    baidu::mirana::poros::UnsampleNearest3DConverter unsamplenearest3dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest3dconverter, {10, 2, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleNearest3dVecScalar) {
+    // aten::upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_nearest_nd_graph(true, "upsample_nearest3d", "", "5.0, 4.0, 3.0");
+    baidu::mirana::poros::UnsampleNearest3DConverter unsamplenearest3dconverter;
+    interpolate_test_helper(graph_IR, &unsamplenearest3dconverter, {10, 2, 2, 2, 2});
+}
+
+// start almost equal
+TEST(Converters, ATenUpsampleLinear1dWithAlignCorners) {
+    // aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_linear1d", "10", "1", "");
+    baidu::mirana::poros::UnsampleLinear1DConverter unsamplelinear1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplelinear1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleLinear1dWithoutAlignCorners) {
+    // aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_linear1d", "10", "0", "5.0");
+    baidu::mirana::poros::UnsampleLinear1DConverter unsamplelinear1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplelinear1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleLinear1dScalesWithoutAlignCorners) {
+    // aten::upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_linear1d", "8", "0", "4.0");
+    baidu::mirana::poros::UnsampleLinear1DConverter unsamplelinear1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplelinear1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleLinear1dVecScaleFactorsWithoutAlignCorners) {
+    // aten::upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(true, "upsample_linear1d", "", "0", "4.0");
+    baidu::mirana::poros::UnsampleLinear1DConverter unsamplelinear1dconverter;
+    interpolate_test_helper(graph_IR, &unsamplelinear1dconverter, {10, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleBilinear2dWithAlignCorners) {
+    // aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_bilinear2d", "10, 8", "1", "");
+    baidu::mirana::poros::UnsampleBilinear2DConverter unsamplebilinear2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplebilinear2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleBilinear2dWithoutAlignCorners) {
+    // aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_bilinear2d", "10, 8", "0", "");
+    baidu::mirana::poros::UnsampleBilinear2DConverter unsamplebilinear2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplebilinear2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleBilinear2dScalesWithoutAlignCorners) {
+    // aten::upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_bilinear2d", "10, 10", "0", "5.0");
+    baidu::mirana::poros::UnsampleBilinear2DConverter unsamplebilinear2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplebilinear2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleBilinear2dVecScaleFactorsWithoutAlignCorners) {
+    // aten::upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(true, "upsample_bilinear2d", "", "0", "5.0, 4.0");
+    baidu::mirana::poros::UnsampleBilinear2DConverter unsamplebilinear2dconverter;
+    interpolate_test_helper(graph_IR, &unsamplebilinear2dconverter, {10, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleTrilinear3dWithAlignCorners) {
+    // aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_trilinear3d", "10, 8, 6", "1", "");
+    baidu::mirana::poros::UnsampleTrilinear3DConverter unsampletrilinear3dconverter;
+    interpolate_test_helper(graph_IR, &unsampletrilinear3dconverter, {10, 2, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleTrilinear3dWithoutAlignCorners) {
+    // aten::upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(false, "upsample_trilinear3d", "10, 8, 6", "0", "");
+    baidu::mirana::poros::UnsampleTrilinear3DConverter unsampletrilinear3dconverter;
+    interpolate_test_helper(graph_IR, &unsampletrilinear3dconverter, {10, 2, 2, 2, 2});
+}
+
+TEST(Converters, ATenUpsampleTrilinear3dVecScaleFactorsWithoutAlignCorners) {
+    // aten::upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    const auto graph_IR = gen_upsample_linear_graph(true, "upsample_trilinear3d", "", "0", "5.0, 4.0, 3.0");
+    baidu::mirana::poros::UnsampleTrilinear3DConverter unsampletrilinear3dconverter;
+    interpolate_test_helper(graph_IR, &unsampletrilinear3dconverter, {10, 2, 2, 2, 2});
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/layer_norm_test.cpp b/poros/unittest/converter/layer_norm_test.cpp
new file mode 100644
index 0000000000..fa1ea9f0a3
--- /dev/null
+++ b/poros/unittest/converter/layer_norm_test.cpp
@@ -0,0 +1,198 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +/** +* @file layer_norm_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/layer_norm.h" +#include "poros/util/test_util.h" + +static void layernorm_test_helper(const std::string& graph_IR, + std::vector& input_data) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + baidu::mirana::poros::LayerNormConverter layernormconverter; + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &layernormconverter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenLayerNormConvertsCorrectlyLast3Dims) { + // aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma : Tensor, + %beta : Tensor): + %1: int = prim::Constant[value=3]() + %2: int = prim::Constant[value=100]() + %3: int = prim::Constant[value=100]() + %4 : int[] = prim::ListConstruct(%1, %2, %3) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor = aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + std::vector input_data; + input_data.push_back(at::randn({4, 3, 100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({3, 100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({3, 100, 100}, {at::kCUDA})); + layernorm_test_helper(graph_IR, input_data); +} + +// 同conv2d +TEST(Converters, ATenLayerNormConvertsCorrectlyLast2Dims) { + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma : Tensor, + %beta : Tensor): + %2: int = prim::Constant[value=100]() + %3: int = prim::Constant[value=100]() + %4 : int[] = prim::ListConstruct(%2, %3) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor = aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + std::vector input_data; + input_data.push_back(at::randn({4, 3, 100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({100, 100}, {at::kCUDA})); + layernorm_test_helper(graph_IR, input_data); +} + +TEST(Converters, ATenLayerNormConvertsCorrectlyLast1Dims) { + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma : Tensor, + %beta : Tensor): + %3: int = prim::Constant[value=100]() + %4 : int[] = prim::ListConstruct(%3) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor = aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + std::vector input_data; + input_data.push_back(at::randn({4, 3, 100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({100}, {at::kCUDA})); + input_data.push_back(at::randn({100}, {at::kCUDA})); + layernorm_test_helper(graph_IR, input_data); +} + +TEST(Converters, ATenLayerNormConvertsCorrectly2InputsGamma) { + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma: Tensor): + %beta: None = prim::Constant() + %1: int = prim::Constant[value=100]() + %4 : int[] = prim::ListConstruct(%1) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor 
= aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + std::vector input_data; + input_data.push_back(at::randn({4, 3, 100, 100}, {at::kCUDA})); + input_data.push_back(at::randn({100}, {at::kCUDA})); + layernorm_test_helper(graph_IR, input_data); +} + +static void layernorm_dy_test_helper(const std::string& graph_IR, + const std::vector& input_data, + bool is_dynamic = false, + std::vector>* prewarm_data = nullptr) { + baidu::mirana::poros::LayerNormConverter layernormconverter; + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &layernormconverter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenLayerNormConvertsCorrectly3dDynamicInput1dNormalizedShape) { + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma: Tensor, + %beta: Tensor): + %1: int = prim::Constant[value=4]() + %4 : int[] = prim::ListConstruct(%1) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor = aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({10, 3, 4}, {at::kCUDA})); + prewarm_data[0].push_back(at::ones({4}, {at::kCUDA})); + prewarm_data[0].push_back(at::ones({4}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({5, 3, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::ones({4}, {at::kCUDA})); + prewarm_data[1].push_back(at::ones({4}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({5, 3, 4}, {at::kCUDA})); + prewarm_data[2].push_back(at::ones({4}, {at::kCUDA})); + prewarm_data[2].push_back(at::ones({4}, {at::kCUDA})); + + std::vector input_data; + input_data.push_back(at::randn({5, 3, 4}, {at::kCUDA})); + input_data.push_back(at::ones({4}, {at::kCUDA})); + input_data.push_back(at::ones({4}, {at::kCUDA})); + + layernorm_dy_test_helper(graph_IR, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenLayerNormConvertsCorrectly3dDynamicInput2dNormalizedShape) { + const auto graph_IR = R"IR( + graph(%0 : Tensor, + %gamma : Tensor, + %beta : Tensor): + %2: int = prim::Constant[value=3]() + %3: int = prim::Constant[value=4]() + %4 : int[] = prim::ListConstruct(%2, %3) + %7 : bool = prim::Constant[value=0]() + %8 : float = prim::Constant[value=1.0000000000000001e-05]() + %9 : Tensor = aten::layer_norm(%0, %4, %gamma, %beta, %8, %7) + return (%9))IR"; + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({10, 3, 4}, {at::kCUDA})); + prewarm_data[0].push_back(at::ones({3, 4}, {at::kCUDA})); + prewarm_data[0].push_back(at::ones({3, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({5, 3, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::ones({3, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::ones({3, 4}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({5, 3, 4}, {at::kCUDA})); + prewarm_data[2].push_back(at::ones({3, 4}, {at::kCUDA})); + prewarm_data[2].push_back(at::ones({3, 4}, {at::kCUDA})); + + std::vector input_data; + input_data.push_back(at::randn({5, 3, 4}, {at::kCUDA})); + input_data.push_back(at::ones({3, 4}, {at::kCUDA})); + 
input_data.push_back(at::ones({3, 4}, {at::kCUDA})); + + layernorm_dy_test_helper(graph_IR, input_data, true, &prewarm_data); +} \ No newline at end of file diff --git a/poros/unittest/converter/linear_test.cpp b/poros/unittest/converter/linear_test.cpp new file mode 100644 index 0000000000..e5fbf23682 --- /dev/null +++ b/poros/unittest/converter/linear_test.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file linear_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/linear.h" +#include "poros/util/test_util.h" + +static void linear_test_helper(const std::string& graph_IR, + const std::vector& input_data, + const std::vector replace_const_index) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + baidu::mirana::poros::LinearConverter linearconverter; + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &linearconverter, + input_data, graph_output, poros_output, nullptr, "", replace_const_index)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_no_bias_graph() { + std::string graph = R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : None = prim::Constant() + %3 : Tensor = aten::linear(%0, %1, %2) + return (%3))IR"; + return graph; +} + +TEST(Converters, ATenLinearNoBiasConvertsCorrectly) { + // aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + const auto graph_IR = gen_no_bias_graph(); + baidu::mirana::poros::LinearConverter linearconverter; + std::vector input_data; + input_data.push_back(at::randn({1, 2}, {at::kCUDA})); + input_data.push_back(at::randn({3, 2}, {at::kCUDA})); // 内部转置 + linear_test_helper(graph_IR, input_data, {}); +} + +TEST(Converters, ATenLinearNoBiasNeedPaddingConvertsCorrectly) { + // aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + const auto graph_IR = gen_no_bias_graph(); + baidu::mirana::poros::LinearConverter linearconverter; + std::vector input_data; + input_data.push_back(at::randn({2, 64, 8}, {at::kCUDA})); + input_data.push_back(at::randn({30, 8}, {at::kCUDA})); // 内部转置 + linear_test_helper(graph_IR, input_data, {}); +} + +TEST(Converters, ATenLinearNoBiasNeedPaddingConstWeightConvertsCorrectly) { + // aten::linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor + const auto graph_IR = gen_no_bias_graph(); + baidu::mirana::poros::LinearConverter linearconverter; + std::vector input_data; + input_data.push_back(at::randn({2, 64, 8}, {at::kCUDA})); + input_data.push_back(at::randn({30, 8}, {at::kCUDA})); // 内部转置 + linear_test_helper(graph_IR, input_data, {1}); //把第二个参数转换成常量 +} + +TEST(Converters, ATenLinearNoBiasNeedPaddingConstWeight2ConvertsCorrectly) { + // aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + const auto graph_IR = gen_no_bias_graph(); + baidu::mirana::poros::LinearConverter linearconverter; + std::vector input_data; + input_data.push_back(at::randn({2, 64, 64, 8}, {at::kCUDA})); + input_data.push_back(at::randn({30, 8}, {at::kCUDA})); // 内部转置 + linear_test_helper(graph_IR, input_data, {1}); //把第二个参数转换成常量 +} + +TEST(Converters, ATenLinearBiasConvertsCorrectly) { + // aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): + %3 : Tensor = aten::linear(%0, %1, %2) + return (%3))IR"; + baidu::mirana::poros::LinearConverter linearconverter; + std::vector input_data; + input_data.push_back(at::randn({1, 3}, {at::kCUDA})); + input_data.push_back(at::randn({2, 3}, {at::kCUDA})); + input_data.push_back(at::randn({2}, {at::kCUDA})); + linear_test_helper(graph_IR, input_data, {}); +} \ No newline at end of file diff --git a/poros/unittest/converter/logical_test.cpp b/poros/unittest/converter/logical_test.cpp new file mode 100644 index 0000000000..945c4e7418 --- /dev/null +++ b/poros/unittest/converter/logical_test.cpp @@ -0,0 +1,201 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file logical_test.cpp +* @author Lin Xiao Chun (linxiaochun@baidu.com) +* @date 2022-02-17 18:32:15 +* @brief +**/ + +#include +#include + +#include "poros/converter/gpu/logical.h" +#include "poros/util/test_util.h" + +enum InputTypeEnum { + TYPE_A = 0, // [4]*[4] + TYPE_B, // [2,2]*[2,2] + TYPE_C, // [4]*[true] + TYPE_D, //broadcasting [1,3,2]*[2] + TYPE_E, //broadcasting [2,3,4]*[3,4] +}; + +static std::vector get_input_data(const InputTypeEnum input_type) { + std::vector input_data; + auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kBool); + + switch (input_type) { + case TYPE_A: // [4]*[4] + input_data.push_back(torch::tensor({false, true, false, true}, options_pyt)); + input_data.push_back(torch::tensor({false, true, true, true}, options_pyt)); + break; + case TYPE_B:// [2,2]*[2,2] + input_data.push_back(torch::tensor({{false, true}, + {false, true}}, options_pyt)); + input_data.push_back(torch::tensor({{false, true}, + {true, true}}, options_pyt)); + break; + case TYPE_C:// [4]*[1] + input_data.push_back(torch::tensor({false, true, false, true}, options_pyt)); + input_data.push_back(torch::tensor({true}, options_pyt)); + break; + case TYPE_D://broadcasting [1,3,2]*[2] + input_data.push_back(torch::tensor({{{true, true}, {false, true}, {false, false}}}, options_pyt)); + input_data.push_back(torch::tensor({false, true}, options_pyt)); + break; + case TYPE_E://broadcasting [2,3,4]*[3,4] + input_data.push_back(torch::tensor({ + {{false, true, false, true}, {false, true, false, false}, + {true, true, true, true}}, + {{false, true, false, false}, {true, true, true, true}, + {false, true, false, true}} + }, options_pyt)); + input_data.push_back(torch::tensor({{false, true, false, true}, + {false, true, false, false}, + {true, true, true, true}}, options_pyt)); + break; + } + + return input_data; +} + +static void and_test_helper(const std::string &graph_IR, + baidu::mirana::poros::IConverter *converter, + const InputTypeEnum input_type) { + + auto input_data = get_input_data(input_type); + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_and_or_tensor_graph(const std::string &op) { + return R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::)IR" + op + R"IR((%0, %1) + return (%2))IR"; +} + +static std::string gen_not_tensor_graph(const std::string &op) { + return R"IR( + graph(%0 : Tensor): + %2 : Tensor = aten::)IR" + op + R"IR((%0) + return (%2))IR"; +} + +TEST(Converters, ATenLogicalAndConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("__and__"); + baidu::mirana::poros::AndConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +TEST(Converters, ATenLogicalBitwiseAndConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("bitwise_and"); + baidu::mirana::poros::AndConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + 
and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +TEST(Converters, ATenLogicalOrConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("__or__"); + baidu::mirana::poros::OrConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +TEST(Converters, ATenLogicalBitwiseOrConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("bitwise_or"); + baidu::mirana::poros::OrConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +TEST(Converters, ATenLogicalXOrConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("__xor__"); + baidu::mirana::poros::XorConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +TEST(Converters, ATenLogicalBitwiseXOrConvertsCorrectly) { + + const auto graph_IR = gen_and_or_tensor_graph("bitwise_xor"); + baidu::mirana::poros::XorConverter converter; + and_test_helper(graph_IR, &converter, TYPE_A); + and_test_helper(graph_IR, &converter, TYPE_B); + and_test_helper(graph_IR, &converter, TYPE_C); + and_test_helper(graph_IR, &converter, TYPE_D); + and_test_helper(graph_IR, &converter, TYPE_E); +} + +static void not_test_helper(const std::string &graph_IR, + baidu::mirana::poros::IConverter *converter, + const InputTypeEnum input_type) { + + auto input_data = get_input_data(input_type); + input_data.pop_back(); // only need one input, pop out the last one + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenLogicalBitwiseNotConvertsCorrectly) { + + const auto graph_IR = gen_not_tensor_graph("bitwise_not"); + baidu::mirana::poros::NotConverter converter; + not_test_helper(graph_IR, &converter, TYPE_A); + not_test_helper(graph_IR, &converter, TYPE_B); + not_test_helper(graph_IR, &converter, TYPE_C); + not_test_helper(graph_IR, &converter, TYPE_D); + not_test_helper(graph_IR, &converter, TYPE_E); +} + +//} \ No newline at end of file diff --git a/poros/unittest/converter/lstm_cell_test.cpp b/poros/unittest/converter/lstm_cell_test.cpp new file mode 100644 index 0000000000..3f04dea455 --- /dev/null +++ b/poros/unittest/converter/lstm_cell_test.cpp @@ -0,0 +1,75 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file lstm_cell_test.cpp +* @author wangrui39@baidu.com +* @date Mon December 13 11:36:11 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/lstm_cell.h" +#include "poros/util/test_util.h" + +static void linear_test_helper(const std::string& graph_IR, + const std::vector& input_data) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + baidu::mirana::poros::LstmCellConverter lstm_cellconverter; + + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &lstm_cellconverter, + input_data, graph_output, poros_output)); + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenlstm_cellconverterCorrectly) { + //aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + + const auto graph = R"IR( + graph(%0 : Tensor, + %1 : Tensor, + %3 : Tensor, + %4 : Tensor, + %5 : Tensor, + %6 : Tensor, + %7 : Tensor): + %2 : Tensor[] = prim::ListConstruct(%0, %1) + %8 : Tensor, %9 : Tensor = aten::lstm_cell(%3, %2, %4, %5, %6, %7) + return (%8))IR"; + + std::vector input_data; + auto input = at::randn({50, 10}, {at::kCUDA}); + auto h0 = at::randn({50, 20}, {at::kCUDA}); + auto c0 = at::randn({50, 20}, {at::kCUDA}); + auto w_ih = at::randn({4 * 20, 10}, {at::kCUDA}); + auto w_hh = at::randn({4 * 20, 20}, {at::kCUDA}); + auto b_ih = at::randn({4 * 20}, {at::kCUDA}); + auto b_hh = at::randn({4 * 20}, {at::kCUDA}); + + input_data.push_back(h0); + input_data.push_back(c0); + input_data.push_back(input); + input_data.push_back(w_ih); + input_data.push_back(w_hh); + input_data.push_back(b_ih); + input_data.push_back(b_hh); + + linear_test_helper(graph, input_data); +} diff --git a/poros/unittest/converter/lstm_test.cpp b/poros/unittest/converter/lstm_test.cpp new file mode 100644 index 0000000000..012e6701a5 --- /dev/null +++ b/poros/unittest/converter/lstm_test.cpp @@ -0,0 +1,224 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file lstm_cell_test.cpp +* @author wangrui39@baidu.com +* @date Mon December 13 11:36:11 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/lstm.h" +#include "poros/util/test_util.h" + +static void lstm_test_helper(const std::string& graph_IR, + const std::vector& input_data, + baidu::mirana::poros::IConverter* converter) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + ASSERT_EQ(3, graph_output.size()); + ASSERT_EQ(3, poros_output.size()); + + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[1], poros_output[1], 2e-6)); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[2], poros_output[2], 2e-6)); + +} + +TEST(Converters, ATenlstmconverterCorrectly) { + // aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + // num_layers = 1 + // bidirectional = false + // batch_first = false + const auto graph = R"IR( + graph( %0 : Tensor, + %1 : Tensor, + %2 : Tensor, + %3 : Tensor, + %4 : Tensor, + %5 : Tensor, + %6 : Tensor): + %11 : bool = prim::Constant[value=1]() + %12 : bool = prim::Constant[value=0]() + %13 : int = prim::Constant[value=1]() + %14 : float = prim::Constant[value=0.0]() + %15 : Tensor[] = prim::ListConstruct(%0, %1) + %16 : Tensor[] = prim::ListConstruct(%3, %4, %5, %6) + %17 : Tensor, %18 : Tensor, %19 : Tensor = aten::lstm(%2, %15, %16, %11, %13, %14, %12, %12, %12) + return (%17, %18, %19))IR"; + + /*const auto graph = R"IR( + graph( %0 : Tensor, + %1 : Tensor, + %2 : Tensor, + %3 : Tensor, + %4 : Tensor, + %5 : Tensor, + %6 : Tensor): + %11 : bool = prim::Constant[value=1]() + %12 : bool = prim::Constant[value=0]() + %13 : int = prim::Constant[value=1]() + %14 : float = prim::Constant[value=0.0]() + %15 : Tensor[] = prim::ListConstruct(%0, %1) + %16 : Tensor[] = prim::ListConstruct(%3, %4, %5, %6) + %17 : Tensor, %18 : Tensor, %19 : Tensor = aten::lstm(%2, %15, %16, %11, %13, %14, %12, %12, %12) + return (%17, %18, %19))IR";*/ + + std::vector input_data; + auto input = at::randn({1, 5, 1}, {at::kCUDA}); + auto h0 = at::randn({1, 5, 2}, {at::kCUDA}); + auto c0 = at::randn({1, 5, 2}, {at::kCUDA}); + + auto w1 = at::randn({8, 1}, {at::kCUDA}); + auto w2 = at::randn({8, 2}, {at::kCUDA}); + auto w3 = at::randn({8}, {at::kCUDA}); + auto w4 = at::randn({8}, {at::kCUDA}); + + input_data.push_back(h0); + input_data.push_back(c0); + input_data.push_back(input); + + input_data.push_back(w1); + input_data.push_back(w2); + input_data.push_back(w3); + input_data.push_back(w4); + + + baidu::mirana::poros::LstmConverter lstmconverter; + lstm_test_helper(graph, input_data, &lstmconverter); +} + +TEST(Converters, ATenlstmconverterBidirectionalCorrectly) { + // aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + // num_layers = 1 + // bidirectional = true + // batch_first = true + const auto graph = R"IR( + graph( %0 : Tensor, + %1 : Tensor, + %2 : Tensor, + %3 : Tensor, + %4 : Tensor, + %5 : Tensor, + 
%6 : Tensor, + %7 : Tensor, + %8 : Tensor, + %9 : Tensor, + %10 : Tensor): + %11 : bool = prim::Constant[value=1]() + %12 : bool = prim::Constant[value=0]() + %13 : int = prim::Constant[value=1]() + %14 : float = prim::Constant[value=0.0]() + %15 : Tensor[] = prim::ListConstruct(%0, %1) + %16 : Tensor[] = prim::ListConstruct(%3, %4, %5, %6, %7, %8, %9, %10) + %17 : Tensor, %18 : Tensor, %19 : Tensor = aten::lstm(%2, %15, %16, %11, %13, %14, %12, %11, %11) + return (%17, %18, %19))IR"; + + + std::vector input_data; + auto input = at::randn({50, 7, 10}, {at::kCUDA}); + auto h0 = at::randn({2, 50, 20}, {at::kCUDA}); + auto c0 = at::randn({2, 50, 20}, {at::kCUDA}); + + auto w1 = at::randn({80, 10}, {at::kCUDA}); + auto w2 = at::randn({80, 20}, {at::kCUDA}); + auto w3 = at::randn({80}, {at::kCUDA}); + auto w4 = at::randn({80}, {at::kCUDA}); + + auto r_w1 = at::randn({80, 10}, {at::kCUDA}); + auto r_w2 = at::randn({80, 20}, {at::kCUDA}); + auto r_w3 = at::randn({80}, {at::kCUDA}); + auto r_w4 = at::randn({80}, {at::kCUDA}); + + input_data.push_back(h0); + input_data.push_back(c0); + input_data.push_back(input); + + input_data.push_back(w1); + input_data.push_back(w2); + input_data.push_back(w3); + input_data.push_back(w4); + input_data.push_back(r_w1); + input_data.push_back(r_w2); + input_data.push_back(r_w3); + input_data.push_back(r_w4); + + + baidu::mirana::poros::LstmConverter lstmconverter; + lstm_test_helper(graph, input_data, &lstmconverter); +} + +TEST(Converters, ATenlstmconverterNumlayerCorrectly) { + // aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) + // num_layers > 1 + // bidirectional = false + // batch_first = true + const auto graph = R"IR( + graph( %0 : Tensor, + %1 : Tensor, + %2 : Tensor, + %3 : Tensor, + %4 : Tensor, + %5 : Tensor, + %6 : Tensor, + %7 : Tensor, + %8 : Tensor, + %9 : Tensor, + %10 : Tensor): + %11 : bool = prim::Constant[value=1]() + %12 : bool = prim::Constant[value=0]() + %13 : int = prim::Constant[value=2]() + %14 : float = prim::Constant[value=0.0]() + %15 : Tensor[] = prim::ListConstruct(%0, %1) + %16 : Tensor[] = prim::ListConstruct(%3, %4, %5, %6, %7, %8, %9, %10) + %17 : Tensor, %18 : Tensor, %19 : Tensor = aten::lstm(%2, %15, %16, %11, %13, %14, %12, %12, %11) + return (%17, %18, %19))IR"; + + std::vector input_data; + auto input = at::randn({50, 7, 10}, {at::kCUDA}); + auto h0 = at::randn({2, 50, 20}, {at::kCUDA}); + auto c0 = at::randn({2, 50, 20}, {at::kCUDA}); + + auto num1_w1 = at::randn({80, 10}, {at::kCUDA}); + auto num1_w2 = at::randn({80, 20}, {at::kCUDA}); + auto num1_w3 = at::randn({80}, {at::kCUDA}); + auto num1_w4 = at::randn({80}, {at::kCUDA}); + auto num2_w1 = at::randn({80, 20}, {at::kCUDA}); + auto num2_w2 = at::randn({80, 20}, {at::kCUDA}); + auto num2_w3 = at::randn({80}, {at::kCUDA}); + auto num2_w4 = at::randn({80}, {at::kCUDA}); + + input_data.push_back(h0); + input_data.push_back(c0); + input_data.push_back(input); + + input_data.push_back(num1_w1); + input_data.push_back(num1_w2); + input_data.push_back(num1_w3); + input_data.push_back(num1_w4); + input_data.push_back(num2_w1); + input_data.push_back(num2_w2); + input_data.push_back(num2_w3); + input_data.push_back(num2_w4); + + baidu::mirana::poros::LstmConverter lstmconverter; + lstm_test_helper(graph, input_data, &lstmconverter); +} diff --git a/poros/unittest/converter/matrix_multiply_test.cpp 
b/poros/unittest/converter/matrix_multiply_test.cpp new file mode 100644 index 0000000000..f96307148d --- /dev/null +++ b/poros/unittest/converter/matrix_multiply_test.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file matrix_multiply_test.cpp +* @author tianjinjin@baidu.com +* @date Tue Sep 14 18:19:00 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/matrix_multiply.h" +#include "poros/util/test_util.h" + +static void matrix_multiply_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector shape1, + std::vector shape2, + bool tripleinputs = false, + std::vector shape3 = {5}) { + std::vector input_data; + input_data.push_back(at::randn(shape1, {at::kCUDA})); + input_data.push_back(at::randn(shape2, {at::kCUDA})); + if (tripleinputs){ + input_data.push_back(at::randn(shape3, {at::kCUDA})); + } + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + c10::ShowLogInfoToStderr(); + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenMatmulConvertersCorrectly) { + // aten::matmul(Tensor self, Tensor other) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::matmul(%0, %1) + return (%2))IR"; + baidu::mirana::poros::MatmulConverter matmulconverter; + matrix_multiply_test_helper(graph_IR, &matmulconverter, {3}, {3}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {1, 1536}, {1536, 2}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {3}, {3, 512}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {512}, {512, 3}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {512, 3}, {3}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {1, 30, 1024}, {1024}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {1, 30, 1024}, {1024, 214}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {8}, {512, 8, 10}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {254, 8}, {512, 8, 10}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {10, 3, 512}, {10, 512, 214}); + matrix_multiply_test_helper(graph_IR, &matmulconverter, {10, 1, 24, 224}, {7, 224, 5}); +} + +TEST(Converters, ATenBmmConvertersCorrectly) { + // aten::bmm(Tensor self, Tensor mat2) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::bmm(%0, %1) + return (%2))IR"; + baidu::mirana::poros::BmmConverter bmmconverter; + matrix_multiply_test_helper(graph_IR, &bmmconverter, {10, 3, 4}, {10, 4, 5}); +} + +static std::string 
gen_addmm_graph(const std::string& beta, const std::string& alpha) { + return R"IR( + graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): + %3 : float = prim::Constant[value=)IR" + beta + R"IR(]() + %4 : float = prim::Constant[value=)IR" + alpha + R"IR(]() + %5 : Tensor = aten::addmm(%0, %1, %2, %3, %4) + return (%5))IR"; +} + +TEST(Converters, ATenAddmmConvertersCorrectly) { + // aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + const auto graph_IR = gen_addmm_graph("1.0", "1.0"); + baidu::mirana::poros::AddmmConverter addmmconverter; + matrix_multiply_test_helper(graph_IR, &addmmconverter, {2, 3}, {2, 3}, true, {3, 3}); + matrix_multiply_test_helper(graph_IR, &addmmconverter, {3}, {2, 3}, true, {3, 3}); +} + +TEST(Converters, ATenAddmmBetaAlphaConvertersCorrectly) { + // aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + const auto graph_IR = gen_addmm_graph("3.3", "2.2"); + baidu::mirana::poros::AddmmConverter addmmconverter; + matrix_multiply_test_helper(graph_IR, &addmmconverter, {2, 3}, {2, 3}, true, {3, 3}); +} \ No newline at end of file diff --git a/poros/unittest/converter/meshgrid_test.cpp b/poros/unittest/converter/meshgrid_test.cpp new file mode 100644 index 0000000000..7e2339bc09 --- /dev/null +++ b/poros/unittest/converter/meshgrid_test.cpp @@ -0,0 +1,66 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file meshgrid_test.cpp +* @author wangrui39@baidu.com +* @date Monday November 27 11:36:11 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/meshgrid.h" +#include "poros/util/test_util.h" + +static void add_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector value1 = {1.0, 2.0, 3.0}, + std::vector value2 = {4.0, 5.0}){ + std::vector input_data; + auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0);//.dtype(torch::kInt32); + input_data.push_back(at::tensor(value1, options_pyt)); + input_data.push_back(at::tensor(value2, options_pyt)); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_meshgrid_graph() { + std::string graph = R"IR( + graph(%x.1 : Tensor, + %y.1 : Tensor): + %10 : int = prim::Constant[value=1]() + %4 : Tensor[] = prim::ListConstruct(%x.1, %y.1) + %5 : Tensor[] = aten::meshgrid(%4) + %grid_x.1 : Tensor, %grid_y.1 : Tensor = prim::ListUnpack(%5) + %11 : Tensor = aten::add(%grid_x.1, %grid_y.1, %10) + return (%11))IR"; + + return graph; +} + +TEST(Converters, ATenMeshgridConvertsCorrectly) { + const auto graph_IR = gen_meshgrid_graph(); + baidu::mirana::poros::MeshgridConverter meshgridconverter; + add_test_helper(graph_IR, &meshgridconverter); +} \ No newline at end of file diff --git a/poros/unittest/converter/mul_div_test.cpp b/poros/unittest/converter/mul_div_test.cpp new file mode 100644 index 0000000000..3e7d8fb1c0 --- /dev/null +++ b/poros/unittest/converter/mul_div_test.cpp @@ -0,0 +1,410 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/** +* @file mul_div_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include +#include + +#include "poros/converter/gpu/mul_div.h" +#include "poros/util/test_util.h" + +static void mul_div_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + bool singleInput, + std::vector shape1 = {5}, + std::vector shape2 = {5}) { + std::vector input_data; + input_data.push_back(at::randn(shape1, {at::kCUDA})); + if (!singleInput){ + input_data.push_back(at::randn(shape2, {at::kCUDA})); + } + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +std::string gen_mul_div_tensor_graph(const std::string& op) { + return R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::)IR" + op + R"IR((%0, %1) + return (%2))IR"; +} + +std::string gen_mul_div_scalar_graph(const std::string& op, const std::string& scalar) { + return R"IR( + graph(%0 : Tensor): + %1 : float = prim::Constant[value=)IR" + scalar + R"IR(]() + %2 : Tensor = aten::)IR" + op + R"IR((%0, %1) + return (%2))IR"; +} + +TEST(Converters, ATenMulConvertsCorrectly) { + // aten::mul.Tensor(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_mul_div_tensor_graph("mul"); + baidu::mirana::poros::MulConverter mulconverter; + mul_div_test_helper(graph_IR, &mulconverter, false); + mul_div_test_helper(graph_IR, &mulconverter, false, {3, 4}, {4}); + mul_div_test_helper(graph_IR, &mulconverter, false, {4}, {3, 4}); + mul_div_test_helper(graph_IR, &mulconverter, false, {4, 1}, {1, 4}); + mul_div_test_helper(graph_IR, &mulconverter, false, {3, 4, 3}, {4, 3}); + mul_div_test_helper(graph_IR, &mulconverter, false, {4, 3}, {3, 4, 3}); +} + +TEST(Converters, ATenMulScalarConvertsCorrectly) { + // aten::mul.Scalar(Tensor self, Scalar other) -> Tensor + const auto graph_IR = gen_mul_div_scalar_graph("mul", "2.4"); + baidu::mirana::poros::MulConverter mulconverter; + mul_div_test_helper(graph_IR, &mulconverter, true); + mul_div_test_helper(graph_IR, &mulconverter, true, {3, 4, 3}); +} + +TEST(Converters, ATenMul_ConvertsCorrectly) { + // aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + const auto graph_IR = gen_mul_div_tensor_graph("mul_"); + baidu::mirana::poros::MulConverter mulconverter; + mul_div_test_helper(graph_IR, &mulconverter, false); + mul_div_test_helper(graph_IR, &mulconverter, false, {3, 4}, {4}); + mul_div_test_helper(graph_IR, &mulconverter, false, {3, 4, 3}, {4, 3}); +} + +TEST(Converters, ATenMul_ScalarConvertsCorrectly) { + // aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
+ const auto graph_IR = gen_mul_div_scalar_graph("mul_", "2.4"); + baidu::mirana::poros::MulConverter mulconverter; + mul_div_test_helper(graph_IR, &mulconverter, true); + mul_div_test_helper(graph_IR, &mulconverter, true, {3, 4, 3}); +} + +TEST(Converters, ATenDivConvertsCorrectly) { + // aten::div.Tensor(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_mul_div_tensor_graph("div"); + baidu::mirana::poros::DivConverter divconverter; + mul_div_test_helper(graph_IR, &divconverter, false); + mul_div_test_helper(graph_IR, &divconverter, false, {3, 4}, {4}); + mul_div_test_helper(graph_IR, &divconverter, false, {4}, {3, 4}); + mul_div_test_helper(graph_IR, &divconverter, false, {4, 1}, {1, 4}); + mul_div_test_helper(graph_IR, &divconverter, false, {3, 4, 3}, {4, 3}); + mul_div_test_helper(graph_IR, &divconverter, false, {4, 3}, {3, 4, 3}); +} + +TEST(Converters, ATenDivScalarConvertsCorrectly) { + // aten::div.Scalar(Tensor self, Scalar other) -> (Tensor) + const auto graph_IR = gen_mul_div_scalar_graph("div", "2.4"); + baidu::mirana::poros::DivConverter divconverter; + mul_div_test_helper(graph_IR, &divconverter, true); + mul_div_test_helper(graph_IR, &divconverter, true, {3, 4, 3}); +} + +TEST(Converters, ATenDiv_ConvertsCorrectly) { + // aten::div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) + const auto graph_IR = gen_mul_div_tensor_graph("div_"); + baidu::mirana::poros::DivConverter divconverter; + mul_div_test_helper(graph_IR, &divconverter, false); + mul_div_test_helper(graph_IR, &divconverter, false, {3, 4}, {4}); + mul_div_test_helper(graph_IR, &divconverter, false, {3, 4, 3}, {4, 3}); +} + +TEST(Converters, ATenDiv_ScalarConvertsCorrectly) { + // aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) + const auto graph_IR = gen_mul_div_scalar_graph("div_", "2.4"); + baidu::mirana::poros::DivConverter divconverter; + mul_div_test_helper(graph_IR, &divconverter, true); + mul_div_test_helper(graph_IR, &divconverter, true, {3, 4, 3}); +} + +TEST(Converters, ATenDivIntDivideIntConvertsCorrectly) { + // aten::div.Tensor(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_mul_div_tensor_graph("div"); + + auto options_pyt_int = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt); + std::vector input_data; + input_data.push_back(torch::tensor({14}, options_pyt_int)); + input_data.push_back(torch::tensor({2}, options_pyt_int)); + + baidu::mirana::poros::DivConverter divconverter; + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &divconverter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenDivFloatDivideIntConvertsCorrectly) { + // aten::div.Scalar(Tensor self, Scalar other) -> (Tensor) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=3]() + %2 : Tensor = aten::div(%0, %1) + return (%2))IR"; + + auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat); + std::vector input_data; + input_data.push_back(torch::tensor({15.3}, options_pyt_float)); + + baidu::mirana::poros::DivConverter divconverter; + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 
运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &divconverter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenDivIntDivideFloatConvertsCorrectly) { + // aten::div.Scalar(Tensor self, Scalar other) -> (Tensor) + const auto graph_IR = gen_mul_div_scalar_graph("div", "2.4"); + + auto options_pyt_int = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt); + std::vector input_data; + input_data.push_back(torch::tensor({15}, options_pyt_int)); + + baidu::mirana::poros::DivConverter divconverter; + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &divconverter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +TEST(Converters, ATenRemainderConvertsCorrectly) { + // aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_mul_div_tensor_graph("remainder"); + baidu::mirana::poros::RemainderConverter remainder; + mul_div_test_helper(graph_IR, &remainder, false); + mul_div_test_helper(graph_IR, &remainder, false, {3, 4}, {4}); + mul_div_test_helper(graph_IR, &remainder, false, {4}, {3, 4}); + mul_div_test_helper(graph_IR, &remainder, false, {4, 1}, {1, 4}); + mul_div_test_helper(graph_IR, &remainder, false, {3, 4, 3}, {4, 3}); + mul_div_test_helper(graph_IR, &remainder, false, {4, 3}, {3, 4, 3}); +} + +TEST(Converters, ATenRemainderScalarConvertsCorrectly) { + // aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor + const auto graph_IR = gen_mul_div_scalar_graph("remainder", "-0.4"); + baidu::mirana::poros::RemainderConverter remainder; + mul_div_test_helper(graph_IR, &remainder, true); + mul_div_test_helper(graph_IR, &remainder, true, {3, 4, 3}); +} + + +static void mul_div_dynamic_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + const std::vector& input_data, + bool is_dynamic = false, + std::vector>* prewarm_data = nullptr) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +TEST(Converters, ATenMulIntdynamicConvertsCorrectly) { + // aten::mul.int(int a, int b) -> (int) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %5 : int = aten::mul(%3, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::MulConverter mulconverter; + std::vector input_data; + input_data.push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int)); + + 
std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({4, 5}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[1].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[2].push_back(at::zeros({2, 3}, {at::kCUDA}).to(at::ScalarType::Int)); + + mul_div_dynamic_test_helper(graph_IR, &mulconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenDivIntdynamicConvertsCorrectly) { + // aten::div.int(int a, int b) -> (float) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %5 : float = aten::div(%3, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::DivConverter divconverter; + std::vector input_data; + input_data.push_back(at::zeros({4, 5}, {at::kCUDA})); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({10, 8}, {at::kCUDA})); + prewarm_data[1].push_back(at::zeros({4, 5}, {at::kCUDA})); + prewarm_data[2].push_back(at::zeros({4, 5}, {at::kCUDA})); + + mul_div_dynamic_test_helper(graph_IR, &divconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenDivNegIntdynamicConvertsCorrectly) { + // aten::div.int(int a, int b) -> (float) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %34 : int = prim::Constant[value=100]() + %35 : int = aten::sub(%3, %34) + %5 : float = aten::div(%35, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::DivConverter divconverter; + std::vector input_data; + input_data.push_back(at::zeros({4, 5}, {at::kCUDA})); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({10, 8}, {at::kCUDA})); + prewarm_data[1].push_back(at::zeros({4, 5}, {at::kCUDA})); + prewarm_data[2].push_back(at::zeros({4, 5}, {at::kCUDA})); + + mul_div_dynamic_test_helper(graph_IR, &divconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenFloordivIntdynamicConvertsCorrectly) { + // aten::floordiv.int(int a, int b) -> (int) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %5 : int = aten::floordiv(%3, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::FloordivConverter floordivconverter; + std::vector input_data; + input_data.push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({12, 5}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[1].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[2].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + mul_div_dynamic_test_helper(graph_IR, &floordivconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenFloordivNegIntdynamicConvertsCorrectly) { + // aten::floordiv.int(int a, int b) -> (int) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %34 : int = prim::Constant[value=100]() + %35 : int = aten::sub(%3, %34) + %5 : int = aten::floordiv(%35, %4) + %6 : 
Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::FloordivConverter floordivconverter; + std::vector input_data; + input_data.push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({12, 5}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[1].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[2].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + mul_div_dynamic_test_helper(graph_IR, &floordivconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenRoundToZeroFloordivIntdynamicConvertsCorrectly) { + // aten::__round_to_zero_floordiv(int a, int b) -> (int) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %5 : int = aten::__round_to_zero_floordiv(%3, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::FloordivConverter floordivconverter; + std::vector input_data; + input_data.push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({12, 5}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[1].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[2].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + mul_div_dynamic_test_helper(graph_IR, &floordivconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenRoundToZeroFloordivNegIntdynamicConvertsCorrectly) { + // aten::__round_to_zero_floordiv(int a, int b) -> (int) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %34 : int = prim::Constant[value=100]() + %35 : int = aten::sub(%3, %34) + %5 : int = aten::__round_to_zero_floordiv(%35, %4) + %6 : Tensor = aten::add(%0, %5, %2) + return (%6))IR"; + baidu::mirana::poros::FloordivConverter floordivconverter; + std::vector input_data; + input_data.push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::zeros({12, 5}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[1].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + prewarm_data[2].push_back(at::zeros({10, 4}, {at::kCUDA}).to(at::ScalarType::Int)); + + mul_div_dynamic_test_helper(graph_IR, &floordivconverter, input_data, true, &prewarm_data); +} \ No newline at end of file diff --git a/poros/unittest/converter/norm_test.cpp b/poros/unittest/converter/norm_test.cpp new file mode 100644 index 0000000000..e0e53a7e50 --- /dev/null +++ b/poros/unittest/converter/norm_test.cpp @@ -0,0 +1,122 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file norm_test.cpp +* @author Lin Xiao Chun (linxiaochun@baidu.com) +* @date 2022-02-23 20:38:15 +* @brief +**/ + +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/norm.h" +#include "poros/util/test_util.h" + +static void norm_test_helper(const std::string &graph_IR, + baidu::mirana::poros::IConverter *converter, + std::vector<int64_t> shape1 = {5}) { + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape1, {at::kCUDA})); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_norm_tensor_graph(const std::string &p, const std::string &dims, const std::string &keepdim) { + return R"IR( +graph(%1 : Tensor): + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %3 : int = prim::Constant[value=)IR" + p + R"IR(]() + %4 : int[] = prim::Constant[value=)IR" + dims + R"IR(]() + %5 : Tensor = aten::norm(%1, %3, %4, %2) + return (%5) +)IR"; +} + +static std::string gen_norm_empty_dims_graph(const std::string &p, const std::string &dims, const std::string &keepdim) { + return R"IR( +graph(%1 : Tensor): + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %3 : int = prim::Constant[value=)IR" + p + R"IR(]() + %4 : int[] = prim::ListConstruct() + %5 : Tensor = aten::norm(%1, %3, %4, %2) + return (%5) +)IR"; +} + + + +TEST(Converters, ATenNormConvertsCorrectlyWith) { + std::vector<std::string> graphIRs; + graphIRs.push_back(gen_norm_tensor_graph("2", "[0]","0")); + graphIRs.push_back(gen_norm_tensor_graph("2", "[1]","0")); + graphIRs.push_back(gen_norm_empty_dims_graph("2", "","0")); + graphIRs.push_back(gen_norm_tensor_graph("2", "[1,2]","0")); + graphIRs.push_back(gen_norm_tensor_graph("2", "[-2,2]","0")); + graphIRs.push_back(gen_norm_tensor_graph("2", "[1,2]","1")); + graphIRs.push_back(gen_norm_tensor_graph("1.5", "[1,2]","0")); + graphIRs.push_back(gen_norm_tensor_graph("0.2", "[-1,-2,-3,-4]","1")); + + baidu::mirana::poros::NormConverter converter; + + for(auto ir:graphIRs){ + norm_test_helper(ir, &converter, {3,4,5,6,7}); + } +} + +static std::string gen_frobenius_norm_tensor_graph(const std::string &dims, const std::string &keepdim) { + return R"IR( +graph(%1 : Tensor): + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %4 : int[] = prim::Constant[value=)IR" + dims + R"IR(]() + %5 : Tensor = aten::frobenius_norm(%1, %4, %2) + return (%5) +)IR"; +} + +static std::string gen_frobenius_norm_empty_dims_graph(const std::string &dims, const std::string &keepdim) { + return R"IR( +graph(%1 : Tensor): + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %4 : int[] = prim::ListConstruct() + %5 : Tensor = aten::frobenius_norm(%1, %4, %2) + return (%5) +)IR"; +} + +TEST(Converters, ATenFrobeniusNormConvertsCorrectlyWith) { + std::vector<std::string> graphIRs; + graphIRs.push_back(gen_frobenius_norm_tensor_graph( "[0]","0")); + graphIRs.push_back(gen_frobenius_norm_tensor_graph("[1]","0")); + graphIRs.push_back(gen_frobenius_norm_empty_dims_graph("","0")); + graphIRs.push_back(gen_frobenius_norm_tensor_graph("[1,2]","0")); + 
graphIRs.push_back(gen_frobenius_norm_tensor_graph("[-2,2]","0")); + graphIRs.push_back(gen_frobenius_norm_tensor_graph("[1,2]","1")); + graphIRs.push_back(gen_frobenius_norm_tensor_graph( "[1,2]","0")); + + baidu::mirana::poros::FrobeniusNormConverter converter; + + for(auto ir:graphIRs){ + norm_test_helper(ir, &converter, {3,4,5,6,7}); + } +} \ No newline at end of file diff --git a/poros/unittest/converter/pooling_test.cpp b/poros/unittest/converter/pooling_test.cpp new file mode 100644 index 0000000000..8e780fefe1 --- /dev/null +++ b/poros/unittest/converter/pooling_test.cpp @@ -0,0 +1,268 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file pooling_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/pooling.h" +#include "poros/util/test_util.h" + +static void pooling_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector<int64_t> shape) { + std::vector<at::Tensor> input_data; + // input_data.push_back(at::randn(shape, {at::kCUDA})); + input_data.push_back(at::randint(-50, 50, shape, {at::kCUDA})); + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_maxpool_graph(const std::string& op, + const std::string& kernel_size, + const std::string& stride, + const std::string& padding, + const std::string& dilation, + const std::string& ceil_mode) { + return R"IR( + graph(%0 : Tensor): + %1 : int[] = prim::Constant[value=[)IR" + kernel_size + R"IR(]]() + %2 : int[] = prim::Constant[value=[)IR" + stride + R"IR(]]() + %3 : int[] = prim::Constant[value=[)IR" + padding + R"IR(]]() + %4 : int[] = prim::Constant[value=[)IR" + dilation + R"IR(]]() + %5 : bool = prim::Constant[value=)IR" + ceil_mode + R"IR(]() + %6 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2, %3, %4, %5) + return (%6))IR"; +} + +static std::string gen_avgpool_graph(const std::string& op, + const std::string& kernel_size, + const std::string& stride, + const std::string& padding, + const std::string& ceil_mode, + const std::string& count_include_pad, + const std::string& divisor_override) { + std::string divisor_ir(""); + std::string op_ir(""); + if (divisor_override.empty()) { + divisor_ir = "None = prim::Constant()"; + } else { + divisor_ir = "int = prim::Constant[value=" + divisor_override + "]()"; + } + if (op == "avg_pool1d") { + op_ir = op + "(%0, %1, %2, %3, %4, %5)"; + } else { + op_ir = op + "(%0, %1, %2, %3, %4, %5, %6)"; + } + return R"IR( + graph(%0 : Tensor): + %1 : int[] = 
prim::Constant[value=[)IR" + kernel_size + R"IR(]]() + %2 : int[] = prim::Constant[value=[)IR" + stride + R"IR(]]() + %3 : int[] = prim::Constant[value=[)IR" + padding + R"IR(]]() + %4 : bool = prim::Constant[value=)IR" + ceil_mode + R"IR(]() + %5 : bool = prim::Constant[value=)IR" + count_include_pad + R"IR(]() + %6 : )IR" + divisor_ir + R"IR( + %7 : Tensor = aten::)IR" + op_ir + R"IR( + return (%7))IR"; +} + +TEST(Converters, ATenMaxPool1DConvertsCorrectly) { + // aten::max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool1d", "3", "2", "1", "1", "0"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); +} + +TEST(Converters, ATenMaxPool1DCeilConvertsCorrectly) { + // aten::max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool1d", "3", "2", "1", "1", "1"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); +} + +TEST(Converters, ATenMaxPool2DConvertsCorrectly) { + // aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool2d", "3, 3", "2, 2", "1, 1", "1, 1", "0"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + +TEST(Converters, ATenMaxPool2DCeilConvertsCorrectly) { + // aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool2d", "3, 3", "2, 2", "1, 1", "1, 1", "1"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); +} + +TEST(Converters, ATenMaxPool3DConvertsCorrectly) { + // aten::max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "1, 1, 1", "0"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} + +TEST(Converters, ATenMaxPool3DCeilConvertsCorrectly) { + // aten::max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor + const auto graph_IR = gen_maxpool_graph("max_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "1, 1, 1", "1"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); +} + +TEST(Converters, ATenAvgPool1DConvertsCorrectly) { + // aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + const auto graph_IR = 
gen_avgpool_graph("avg_pool1d", "3", "2", "1", "0", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); +} + +TEST(Converters, ATenAvgPool1DCeilConvertsCorrectly) { + // aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool1d", "3", "2", "1", "1", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); + // pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); // fail +} + +TEST(Converters, ATenAvgPool1DNoCountPadConvertsCorrectly) { + // aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool1d", "3", "2", "1", "0", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); +} + +TEST(Converters, ATenAvgPool1DCeilNoCountPadConvertsCorrectly) { + // aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool1d", "3", "2", "1", "1", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 1, 8}); +} + +TEST(Converters, ATenAvgPool2DConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "0", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + +TEST(Converters, ATenAvgPool2DCeilConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "1", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + // pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); // fail +} + +TEST(Converters, ATenAvgPool2DNoCountPadConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "0", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + +TEST(Converters, ATenAvgPool2DCeilNoCountPadConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "1", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + +TEST(Converters, ATenAvgPool2DDivConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "0", "1", "4"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + +TEST(Converters, ATenAvgPool2DNegtiveDivConvertsCorrectly) { + // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool2d", "3, 3", "2, 2", "1, 1", "0", "1", "-4"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 8, 8}); +} + + +TEST(Converters, ATenAvgPool3DConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "0", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} + +TEST(Converters, ATenAvgPool3DCeilConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "1", "1", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + // pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); // fail +} + +TEST(Converters, ATenAvgPool3DNoCountPadConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "0", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} + +TEST(Converters, ATenAvgPool3DCeilNoCountPadConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "1", "0", ""); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} + +TEST(Converters, ATenAvgPool3DDivConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "0", "1", "8"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} + +TEST(Converters, ATenAvgPool3DNegtiveDivConvertsCorrectly) { + // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor + const auto graph_IR = gen_avgpool_graph("avg_pool3d", "3, 3, 3", "2, 2, 2", "1, 1, 1", "0", "1", "-8"); + baidu::mirana::poros::PoolingConverter poolingconverter; + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 7, 7, 7}); + pooling_test_helper(graph_IR, &poolingconverter, {1, 3, 8, 8, 8}); +} \ No newline at end of file diff --git a/poros/unittest/converter/reduce_test.cpp b/poros/unittest/converter/reduce_test.cpp new file mode 100644 index 0000000000..81c8f95145 --- /dev/null +++ b/poros/unittest/converter/reduce_test.cpp @@ -0,0 +1,362 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
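+// For orientation, the IR-template helpers below expand as in this hand-worked
+// sketch (derived from gen_mean_sum_dim_graph as defined in this file; it adds
+// no extra coverage): gen_mean_sum_dim_graph("mean", "1", "0") yields
+//   graph(%0 : Tensor):
+//     %1 : int[] = prim::Constant[value=[1]]()
+//     %2 : bool = prim::Constant[value=0]()
+//     %3 : None = prim::Constant()
+//     %4 : Tensor = aten::mean(%0, %1, %2, %3)
+//     return (%4)
+// i.e. aten::mean.dim over dim 1 with keepdim=False and dtype=None.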
+ +/** +* @file reduce_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/reduce.h" +#include "poros/util/test_util.h" + +static void reduce_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector<int64_t> shape1, + bool single_input = true, + std::vector<int64_t> shape2 = {4, 4}, + bool single_output = true){ + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape1, {at::kCUDA})); + + if (!single_input){ + input_data.push_back(at::randn(shape2, {at::kCUDA})); + } + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + if (single_output) { + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + } else { + ASSERT_EQ(2, graph_output.size()); + ASSERT_EQ(2, poros_output.size()); + } + + for (size_t i = 0; i < graph_output.size(); i++) { + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[i], poros_output[i], 2e-6)); + } +} + +static std::string gen_basic_graph(const std::string& op) { + return R"IR( + graph(%0 : Tensor): + %1 : None = prim::Constant() + %2 : Tensor = aten::)IR" + + op + R"IR((%0, %1) + return (%2))IR"; +} + +static std::string gen_min_max_graph(const std::string& op) { + return R"IR( + graph(%0 : Tensor): + %1 : Tensor = aten::)IR" + + op + R"IR((%0) + return (%1))IR"; +} + +static std::string gen_min_max_other_graph(const std::string& op) { + return R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::)IR" + + op + R"IR((%0, %1) + return (%2))IR"; +} + +static std::string gen_min_max_dim_graph(const std::string& op, const std::string& dim) { + return R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=)IR" + dim + R"IR(]() + %2 : bool = prim::Constant[value=0]() + %3 : Tensor, %4 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2) + return (%3, %4))IR"; +} + +static std::string gen_mean_sum_dim_graph(const std::string& op, const std::string& dim, const std::string& keepdim) { + return R"IR( + graph(%0 : Tensor): + %1 : int[] = prim::Constant[value=[)IR" + dim + R"IR(]]() + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %3 : None = prim::Constant() + %4 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2, %3) + return (%4))IR"; +} + +static std::string gen_prod_dim_graph(const std::string& op, const std::string& dim, const std::string& keepdim) { + return R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=)IR" + dim + R"IR(]() + %2 : bool = prim::Constant[value=)IR" + keepdim + R"IR(]() + %3 : None = prim::Constant() + %4 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2, %3) + return (%4))IR"; +} + +TEST(Converters, ATenMeanConvertsCorrectly) { + // aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_basic_graph("mean"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4}); +} + +TEST(Converters, ATenMeanDimConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "1", "0"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenMeanMltiDimsConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "0, 1", "0"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenMeanKeepDimsConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "1", "1"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4}); +} + +TEST(Converters, ATenMeanDimNegOneIndexConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "-1", "0"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenMeanDimNegOneIndexKeepDimsConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "-1", "1"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenMeanDimNegIndexConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "-2", "0"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenMeanDimNegIndexKeepDimsConvertsCorrectly) { + // aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("mean", "-2", "1"); + baidu::mirana::poros::MeanConverter meanconverter; + reduce_test_helper(graph_IR, &meanconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumConvertsCorrectly) { + // aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_basic_graph("sum"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4}); +} + +TEST(Converters, ATenSumDimConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "1", "0"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumMltiDimsConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "0, 1", "0"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumKeepDimsConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "1", "1"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4}); +} + +TEST(Converters, ATenSumDimNegOneIndexConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "-1", "0"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumDimNegOneIndexKeepDimsConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "-1", "1"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumDimNegIndexConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "-2", "0"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSumDimNegIndexKeepDimsConvertsCorrectly) { + // aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_mean_sum_dim_graph("sum", "-2", "1"); + baidu::mirana::poros::SumConverter sumconverter; + reduce_test_helper(graph_IR, &sumconverter, {4, 4, 4}); +} + +TEST(Converters, ATenProdConvertsCorrectly) { + // aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_basic_graph("prod"); + baidu::mirana::poros::ProdConverter prodconverter; + reduce_test_helper(graph_IR, &prodconverter, {4, 4}); +} + +TEST(Converters, ATenProdDimConvertsCorrectly) { + // aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + const auto graph_IR = gen_prod_dim_graph("prod", "1", "0"); + baidu::mirana::poros::ProdConverter prodconverter; + reduce_test_helper(graph_IR, &prodconverter, {4, 4, 4}); +} + +TEST(Converters, ATenProdKeepDimsConvertsCorrectly) { + // aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor + const auto graph_IR = gen_prod_dim_graph("prod", "1", "1"); + baidu::mirana::poros::ProdConverter prodconverter; + reduce_test_helper(graph_IR, &prodconverter, {4, 4}); +} + +TEST(Converters, ATenMaxConvertsCorrectly) { + // aten::max(Tensor self) -> Tensor + const auto graph_IR = gen_min_max_graph("max"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 4}); +} + +TEST(Converters, ATenMinConvertsCorrectly) { + // aten::min(Tensor self) -> Tensor + const auto graph_IR = gen_min_max_graph("min"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 4}); +} + +TEST(Converters, ATenMaxOtherConvertsCorrectly) { + // aten::max.other(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_min_max_other_graph("max"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 4}, false, {4, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {3, 4}, false, {4}); + reduce_test_helper(graph_IR, &maxminconverter, {4}, false, {3, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {4, 1}, false, {1, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {3, 4, 3}, false, {4, 3}); + reduce_test_helper(graph_IR, &maxminconverter, {4, 3}, false, {3, 4, 3}); +} + +TEST(Converters, ATenMinOtherConvertsCorrectly) { + // aten::min.other(Tensor self, Tensor other) -> Tensor + const auto graph_IR = gen_min_max_other_graph("min"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 4}, false, {4, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {3, 4}, false, {4}); + reduce_test_helper(graph_IR, &maxminconverter, {4}, false, {3, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {4, 1}, false, {1, 4}); + reduce_test_helper(graph_IR, &maxminconverter, {3, 4, 3}, false, {4, 3}); + reduce_test_helper(graph_IR, &maxminconverter, {4, 3}, false, {3, 4, 3}); +} + +TEST(Converters, ATenMaxDimConvertsCorrectly) { + // aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + const auto graph_IR = gen_min_max_dim_graph("max", "0"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR2 = gen_min_max_dim_graph("max", "1"); + reduce_test_helper(graph_IR2, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR3 = gen_min_max_dim_graph("max", "-1"); + reduce_test_helper(graph_IR3, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR4 = gen_min_max_dim_graph("max", "-1"); + reduce_test_helper(graph_IR4, &maxminconverter, {4, 3}, true, {}, false); +} + +TEST(Converters, ATenMinDimConvertsCorrectly) { + // aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + const auto graph_IR = gen_min_max_dim_graph("min", "0"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + reduce_test_helper(graph_IR, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR2 = gen_min_max_dim_graph("min", "1"); + reduce_test_helper(graph_IR2, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR3 = gen_min_max_dim_graph("min", "-1"); + reduce_test_helper(graph_IR3, &maxminconverter, {4, 5, 3}, true, {}, false); + const auto graph_IR4 = gen_min_max_dim_graph("min", "-1"); + reduce_test_helper(graph_IR4, &maxminconverter, {4, 3}, true, {}, false); +} 
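+// The dynamic-shape tests below reuse gen_min_max_dim_graph. Hand-expanded for
+// reference, gen_min_max_dim_graph("max", "0") produces
+//   graph(%0 : Tensor):
+//     %1 : int = prim::Constant[value=0]()
+//     %2 : bool = prim::Constant[value=0]()
+//     %3 : Tensor, %4 : Tensor = aten::max(%0, %1, %2)
+//     return (%3, %4)
+// so each case checks two outputs (values and indices) against native PyTorch.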
+ +TEST(Converters, ATenMaxDimDynamicConvertsCorrectly) { + // aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + const auto graph_IR = gen_min_max_dim_graph("max", "0"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 5, 6}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({3, 4, 5}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({3, 4, 5}, {at::kCUDA})); + + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn({3, 4, 5}, {at::kCUDA})); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = true; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maxminconverter, + input_data, graph_output, poros_output, &prewarm_data)); + ASSERT_EQ(2, graph_output.size()); + ASSERT_EQ(2, poros_output.size()); + + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[1], poros_output[1], 2e-6)); +} + +TEST(Converters, ATenMinDimDynamicConvertsCorrectly) { + // aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) + const auto graph_IR = gen_min_max_dim_graph("min", "1"); + baidu::mirana::poros::MaxMinConverter maxminconverter; + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 5, 6}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({3, 4, 5}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({3, 4, 5}, {at::kCUDA})); + + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn({3, 4, 5}, {at::kCUDA})); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = true; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maxminconverter, + input_data, graph_output, poros_output, &prewarm_data)); + ASSERT_EQ(2, graph_output.size()); + ASSERT_EQ(2, poros_output.size()); + + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[1], poros_output[1], 2e-6)); +} \ No newline at end of file diff --git a/poros/unittest/converter/reflection_pad_test.cpp b/poros/unittest/converter/reflection_pad_test.cpp new file mode 100644 index 0000000000..8408e96519 --- /dev/null +++ b/poros/unittest/converter/reflection_pad_test.cpp @@ -0,0 +1,137 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
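+// A note on the prewarm_data convention used by the dynamic-shape tests in this
+// file: the three inner vectors look like the max/min/opt-style inputs used to
+// prewarm a TensorRT dynamic-shape profile (an inference from the usage pattern
+// in this patch, not something it states). A minimal sketch:
+//   std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+//   prewarm_data[0].push_back(at::randn({3, 6}, {at::kCUDA}));  // largest shape
+//   prewarm_data[1].push_back(at::randn({2, 5}, {at::kCUDA}));  // smallest shape
+//   prewarm_data[2].push_back(at::randn({2, 5}, {at::kCUDA}));  // typical shape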
+ +/** +* @file reflection_pad_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/reflection_pad.h" +#include "poros/util/test_util.h" + +static void reflection_pad_test_helper(const std::string& graph_IR, + std::vector<int64_t> shape, + bool is_dynamic = false, + std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) { + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape, {at::kCUDA})); + + baidu::mirana::poros::ReflectionPadConverter reflectionpadconverter; + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &reflectionpadconverter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +static std::string gen_reflection_pad_graph(const std::string& op, + const std::string& padding) { + return R"IR( + graph(%0 : Tensor): + %1 : int[] = prim::Constant[value=[)IR" + padding + R"IR(]]() + %2 : Tensor = aten::)IR" + op + R"IR((%0, %1) + return (%2))IR"; +} + +TEST(Converters, ATenReflectionPad1DConvertsCorrectly) { + // aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = gen_reflection_pad_graph("reflection_pad1d", "2, 2"); + reflection_pad_test_helper(graph_IR, {2, 5}); +} + +TEST(Converters, ATenReflectionPad2DConvertsCorrectly) { + // aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = gen_reflection_pad_graph("reflection_pad2d", "1, 1, 2, 3"); + reflection_pad_test_helper(graph_IR, {3, 4, 3}); +} + +TEST(Converters, ATenReflectionPad1DDynamicConvertsCorrectly) { + // aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = gen_reflection_pad_graph("reflection_pad1d", "2, 3"); + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({3, 6}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({2, 5}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({2, 5}, {at::kCUDA})); + + reflection_pad_test_helper(graph_IR, {2, 5}, true, &prewarm_data); +} + +TEST(Converters, ATenReflectionPad2DDynamicConvertsCorrectly) { + // aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = gen_reflection_pad_graph("reflection_pad2d", "1, 1, 2, 3"); + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 5, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({3, 4, 3}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({3, 4, 3}, {at::kCUDA})); + + reflection_pad_test_helper(graph_IR, {3, 4, 3}, true, &prewarm_data); +} + +TEST(Converters, ATenReflectionPad1DDynamicscalarinputConvertsCorrectly) { + // aten::reflection_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=1]() + %3 : int = prim::Constant[value=2]() + %4 : int = aten::size(%0, %1) + %5 : float = aten::div(%4, %3) + %6 : int = aten::floor(%5) + %7 : int[] = prim::ListConstruct(%1, %6) + %8 : Tensor = aten::reflection_pad1d(%0, %7) + return (%8))IR"; + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({3, 7}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({2, 
5}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({2, 5}, {at::kCUDA})); + + reflection_pad_test_helper(graph_IR, {2, 7}, true, &prewarm_data); +} + + +TEST(Converters, ATenReflectionPad2DDynamicscalarinputConvertsCorrectly) { + // aten::reflection_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=1]() + %3 : int = prim::Constant[value=2]() + %4 : int = aten::size(%0, %1) + %5 : float = aten::div(%4, %3) + %6 : int = aten::floor(%5) + %7 : int[] = prim::ListConstruct(%1, %2, %3, %6) + %8 : Tensor = aten::reflection_pad2d(%0, %7) + return (%8))IR"; + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 7, 4}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({3, 5, 3}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({3, 5, 3}, {at::kCUDA})); + + reflection_pad_test_helper(graph_IR, {3, 5, 3}, true, &prewarm_data); + +} \ No newline at end of file diff --git a/poros/unittest/converter/replication_pad_test.cpp b/poros/unittest/converter/replication_pad_test.cpp new file mode 100644 index 0000000000..4e0d331e6a --- /dev/null +++ b/poros/unittest/converter/replication_pad_test.cpp @@ -0,0 +1,105 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
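+// Semantics reminder for the cases below: replication padding repeats the edge
+// value. A small worked example (illustrative only, not part of the test suite):
+//   auto x = at::tensor({1.0f, 2.0f, 3.0f, 4.0f}).view({1, 1, 4});
+//   at::replication_pad1d(x, {2, 3});
+//   // -> [[[1, 1, 1, 2, 3, 4, 4, 4, 4]]]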
+ +/** +* @file replication_pad_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/replication_pad.h" +#include "poros/util/test_util.h" + +static void replicationpad_test_helper(const std::string& graph_IR, + std::vector<int64_t> shape) { + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape, {at::kCUDA})); + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + baidu::mirana::poros::ReplicationPadConverter replicationpadconverter; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &replicationpadconverter, + input_data, graph_output, poros_output)); + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + // ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +static std::string gen_replicationpad_graph(const std::string& op, + const std::string& padding) { + return R"IR( + graph(%0 : Tensor): + %1 : int[] = prim::Constant[value=[)IR" + padding + R"IR(]]() + %2 : Tensor = aten::)IR" + op + R"IR((%0, %1) + return (%2))IR"; +} + +TEST(Converters, ATenReplicationPad1DConvertsCorrectly) { + // aten::replication_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad1d", "2, 3"); + replicationpad_test_helper(graph_IR, {1, 3, 4}); +} + +TEST(Converters, ATenReplicationPad1DRightZeroConvertsCorrectly) { + // aten::replication_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad1d", "2, 0"); + replicationpad_test_helper(graph_IR, {1, 3, 4}); +} + +TEST(Converters, ATenReplicationPad1DLeftZeroConvertsCorrectly) { + // aten::replication_pad1d(Tensor self, int[2] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad1d", "0, 3"); + replicationpad_test_helper(graph_IR, {1, 3, 4}); +} + +TEST(Converters, ATenReplicationPad2DConvertsCorrectly) { + // aten::replication_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad2d", "2, 3, 2, 3"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5}); +} + +TEST(Converters, ATenReplicationPad2DBottomZeroConvertsCorrectly) { + // aten::replication_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad2d", "2, 0, 2, 0"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5}); +} + +TEST(Converters, ATenReplicationPad2DTopZeroConvertsCorrectly) { + // aten::replication_pad2d(Tensor self, int[4] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad2d", "0, 3, 0, 3"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5}); +} + +TEST(Converters, ATenReplicationPad3DConvertsCorrectly) { + // aten::replication_pad3d(Tensor self, int[6] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad3d", "2, 3, 2, 3, 1, 4"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5, 3}); +} + +TEST(Converters, ATenReplicationPad3DRightBottomZeroConvertsCorrectly) { + // aten::replication_pad3d(Tensor self, int[6] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad3d", "2, 0, 2, 0, 1, 0"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5, 3}); +} + +TEST(Converters, 
ATenReplicationPad3DLeftTopZeroConvertsCorrectly) { + // aten::replication_pad3d(Tensor self, int[6] padding) -> Tensor + const auto graph_IR = gen_replicationpad_graph("replication_pad3d", "2, 0, 2, 0, 1, 0"); + replicationpad_test_helper(graph_IR, {1, 3, 4, 5, 3}); +} \ No newline at end of file diff --git a/poros/unittest/converter/roll_test.cpp b/poros/unittest/converter/roll_test.cpp new file mode 100644 index 0000000000..7bb4ee02df --- /dev/null +++ b/poros/unittest/converter/roll_test.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file roll_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Jul 20 19:34:51 CST 2022 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/roll.h" +#include "poros/util/test_util.h" + +static void roll_test_helper(const std::string& graph_IR, + std::vector<int64_t> shape, + bool is_dynamic = false, + std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) { + std::vector<at::Tensor> input_data; + int64_t shape_mul = 1; + for (int64_t& s : shape) { + shape_mul *= s; + } + input_data.push_back(at::randint(0, shape_mul, shape, {at::kCUDA})); + + baidu::mirana::poros::RollConverter rollconverter; + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &rollconverter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +static std::string gen_roll_graph(const std::string& shifts, const std::string& dims) { + return R"IR( + graph(%0 : Tensor): + %1 : int[] = prim::Constant[value=)IR" + shifts + R"IR(]() + %2 : int[] = prim::Constant[value=)IR" + dims + R"IR(]() + %3 : Tensor = aten::roll(%0, %1, %2) + return (%3))IR"; +} + +TEST(Converters, ATenRollConvertsCorrectly) { + // aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor) + const std::string graph_IR = gen_roll_graph("[-1, 0, -2, 3]", "[0, 1, 2, 3]"); + roll_test_helper(graph_IR, {4, 4, 4, 4}); +} + + +TEST(Converters, ATenRollConvertsCorrectlyShiftsGreaterThanDims) { + // aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor) + const std::string graph_IR = gen_roll_graph("[-99, 100, 51, -21]", "[0, 1, 2, 3]"); + roll_test_helper(graph_IR, {4, 4, 4, 4}); +} + +TEST(Converters, ATenRollConvertsCorrectlyShiftSomeDims) { + // aten::roll(Tensor self, int[1] shifts, int[1] dims=[]) -> (Tensor) + const std::string graph_IR = gen_roll_graph("[0, -2, 3]", "[0, 1, 3]"); + roll_test_helper(graph_IR, {4, 4, 4, 4}); +} \ No newline at end of file diff --git a/poros/unittest/converter/select_test.cpp b/poros/unittest/converter/select_test.cpp new file mode 100644 index 0000000000..8e1ac6ef76 --- /dev/null +++ b/poros/unittest/converter/select_test.cpp @@ 
-0,0 +1,1350 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** +* @file select_test.cpp +* @author tianshaoqing@baidu.com +* @date Wed Sep 27 11:24:21 CST 2021 +* @brief +**/ +#include <string> +#include <gtest/gtest.h> + +#include "poros/converter/gpu/select.h" +#include "poros/util/test_util.h" + +static void select_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector<int64_t> shape, + bool is_dynamic = false, + std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) { + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape, {at::kCUDA})); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +static void split_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + std::vector<int64_t> shape, + const int64_t& output_size) { + std::vector<at::Tensor> input_data; + input_data.push_back(at::randn(shape, {at::kCUDA})); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + + ASSERT_EQ(output_size, graph_output.size()); + ASSERT_EQ(output_size, poros_output.size()); + + for (int64_t i = 0; i < output_size; i++) { + ASSERT_TRUE(graph_output[i].equal(poros_output[i])); + } +} + +static void embedding_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter) { + + std::vector<at::Tensor> input_data; + auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt64); + auto weight = at::randn({10, 4}, {at::kCUDA}); + auto input = at::tensor({2, 3, 4}, options_pyt); + input_data.push_back(weight); + input_data.push_back(input); + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output)); + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + + ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6)); +} + +static std::string gen_select_graph(const std::string& dim, const std::string& index) { + return R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=)IR" + dim + R"IR(]() + %2 : int = prim::Constant[value=)IR" + index + R"IR(]() + %3 : Tensor = aten::select(%0, %1, %2) + return (%3))IR"; +} + +static 
std::string gen_slice_graph(const std::string& dim, + const std::string& start, + const std::string& end, + const std::string& step) { + std::string start_ir, end_ir; + if (start.empty()) { + start_ir = "%2 : None = prim::Constant()"; + } else { + start_ir = "%2 : int = prim::Constant[value=" + start + "]()"; + } + if (end.empty()) { + end_ir = "%3 : None = prim::Constant()"; + } else { + end_ir = "%3 : int = prim::Constant[value=" + end + "]()"; + } + return R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=)IR" + dim + R"IR(]() + )IR" + start_ir + R"IR( + )IR" + end_ir + R"IR( + %4 : int = prim::Constant[value=)IR" + step + R"IR(]() + %5 : Tensor = aten::slice(%0, %1, %2, %3, %4) + return (%5))IR"; +} + +static std::string gen_narrow_graph(const std::string& dim, + const std::string& start, + const std::string& length, + bool singleinput) { + if (singleinput) { + return R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=)IR" + dim + R"IR(]() + %2 : int = prim::Constant[value=)IR" + start + R"IR(]() + %3 : int = prim::Constant[value=)IR" + length + R"IR(]() + %4 : Tensor = aten::narrow(%0, %1, %2, %3) + return (%4))IR"; + } else { + return R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : int = prim::Constant[value=)IR" + dim + R"IR(]() + %3 : int = prim::Constant[value=)IR" + length + R"IR(]() + %4 : Tensor = aten::narrow(%0, %2, %1, %3) + return (%4))IR"; + } +} + +static std::string gen_indexput_graph(const std::string& fold) { + return R"IR( + graph(%x : Tensor): + %none : NoneType = prim::Constant() + %0 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=2]() + %4 : int = prim::Constant[value=4]() + %negtive : int = prim::Constant[value=-1]() + %fold : int = prim::Constant[value=)IR" + fold + R"IR(]() + %false : bool = prim::Constant[value=0]() + + %out : Tensor = aten::zeros_like(%x, %none, %none, %none, %none, %none) + %302 : Tensor = aten::slice(%x, %0, %none, %none, %1) + %303 : Tensor = aten::slice(%302, %1, %1, %none, %1) + %304 : Tensor = aten::slice(%303, %2, %none, %fold, %1) + + %2726 : int = aten::size(%out, %0) + %2731 : Tensor = aten::arange(%2726, %4, %none, %none, %none) + %2733 : Tensor = aten::slice(%2731, %0, %none, %none, %1) + + %2735 : int = aten::size(%out, %1) + %2740 : Tensor = aten::arange(%2735, %4, %none, %none, %none) + %2742 : Tensor = aten::slice(%2740, %0, %none, %negtive, %1) + + %2744 : int = aten::size(%out, %2) + %2749 : Tensor = aten::arange(%2744, %4, %none, %none, %none) + %2751 : Tensor = aten::slice(%2749, %0, %none, %fold, %1) + + %2752 : int[] = prim::Constant[value=[-1, 1, 1]]() + %2753 : Tensor = aten::view(%2733, %2752) + %2754 : int[] = prim::Constant[value=[-1, 1]]() + %2755 : Tensor = aten::view(%2742, %2754) + %2756 : Tensor?[] = prim::ListConstruct(%2753, %2755, %2751) + %2757 : Tensor = aten::index_put(%out, %2756, %304, %false) + return (%2757))IR"; +} + +static std::string gen_indexput_with_singular_value_graph() { + return R"IR( + graph(%x : Tensor): + %false : bool = prim::Constant[value=0]() + %none : NoneType = prim::Constant() + %neg1 : int = prim::Constant[value=-1]() + %0 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=1]() + %4 : int = prim::Constant[value=4]() + %device : Device = prim::Constant[value="cuda:0"]() + + %size : int[] = aten::size(%x) + %input_shape : int[] = aten::slice(%size, %none, %neg1, %1) + %attention_mask : Tensor = aten::zeros(%input_shape, %none, %none, %device, %none) + %92 : int = 
aten::size(%attention_mask, %1) + %90 : Tensor = aten::arange(%92, %4, %none, %none, %none) + %86 : Tensor = aten::slice(%90, %0, %none, %none, %1) + %2326 : int = prim::dtype(%86) + %101 : Tensor = aten::tensor(%0, %2326, %device, %false) + + %index : Tensor?[] = prim::ListConstruct(%101, %86) + + %28 : int = prim::dtype(%attention_mask) + %value : Tensor = aten::tensor(%1, %28, %device, %false) + %tmp : Tensor = aten::index_put(%attention_mask, %index, %value, %false) + %out : Tensor = aten::mul(%tmp, %4) + return (%out))IR"; +} + +TEST(Converters, ATenSelectIntConvertsCorrectly) { + // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a) + const auto graph_IR = gen_select_graph("0", "0"); + baidu::mirana::poros::SelectConverter selectconverter; + select_test_helper(graph_IR, &selectconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSelectIntDimIsOneConvertsCorrectly) { + // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a) + const auto graph_IR = gen_select_graph("1", "0"); + baidu::mirana::poros::SelectConverter selectconverter; + select_test_helper(graph_IR, &selectconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSelectIntDimNegativeConvertsCorrectly) { + // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a) + const auto graph_IR = gen_select_graph("-2", "0"); + baidu::mirana::poros::SelectConverter selectconverter; + select_test_helper(graph_IR, &selectconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSelectIntNegIndexConvertsCorrectly) { + // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a) + const auto graph_IR = gen_select_graph("0", "-1"); + baidu::mirana::poros::SelectConverter selectconverter; + select_test_helper(graph_IR, &selectconverter, {4, 4, 4}); +} + +TEST(Converters, ATenSelectSelfDynaimcConverterCorrectly) { + // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a) + const auto graph_IR = gen_select_graph("3", "0"); + baidu::mirana::poros::SelectConverter selectconverter; + + std::vector<at::Tensor> input_data; + auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt); + input_data.push_back(at::randint(0, 100, {3, 4, 5, 6}, options_pyt)); // indices + + std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randint(0, 3, {3, 4, 5, 6}, options_pyt)); // indices + prewarm_data[1].push_back(at::randint(0, 3, {2, 3, 4, 5}, options_pyt)); // indices + prewarm_data[2].push_back(at::randint(0, 3, {2, 3, 4, 5}, options_pyt)); // indices + + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = true; + // run both the original graph and the poros engine to collect outputs + std::vector<at::Tensor> graph_output; + std::vector<at::Tensor> poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &selectconverter, + input_data, graph_output, poros_output, &prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +TEST(Converters, ATenSliceConvertsCorrectly) { + // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a) + const auto graph_IR = gen_slice_graph("2", "0", "2", "1"); + baidu::mirana::poros::SliceConverter sliceconverter; + select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6}); +} + +TEST(Converters, ATenSliceDimNegConvertsCorrectly) { + // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? 
+TEST(Converters, ATenSelectSelfDynamicConverterCorrectly) {
+    // aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
+    const auto graph_IR = gen_select_graph("3", "0");
+    baidu::mirana::poros::SelectConverter selectconverter;
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt);
+    input_data.push_back(at::randint(0, 100, {3, 4, 5, 6}, options_pyt));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randint(0, 3, {3, 4, 5, 6}, options_pyt));
+    prewarm_data[1].push_back(at::randint(0, 3, {2, 3, 4, 5}, options_pyt));
+    prewarm_data[2].push_back(at::randint(0, 3, {2, 3, 4, 5}, options_pyt));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &selectconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenSliceConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "0", "2", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceDimNegConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("-2", "0", "2", "1");
+
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceStartNoneConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "", "3", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceStartNegConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "-2", "3", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceEndNoneConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "1", "", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceEndNegConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "0", "-2", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceStartEndNegConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "-3", "-1", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceStartEndNoneConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "", "", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceStepConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("2", "0", "3", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {3, 4, 5, 6});
+}
+
+TEST(Converters, ATenSliceResnetTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("1", "0", "8", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+    select_test_helper(graph_IR, &sliceconverter, {1, 8, 256, 56, 56});
+}
+TEST(Converters, ATenSliceDimDynamicTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("0", "2", "8", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimDynamicStartEndBothNegTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("0", "-5", "-1", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimDynamicStartEndBothNoneTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("0", "", "", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimDynamicTestStepConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("0", "-8", "-1", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimNotDynamicTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("1", "10", "16", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+TEST(Converters, ATenSliceDimNotDynamicStartEndBothNegTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("1", "-10", "-5", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimNotDynamicStartEndBothNoneTestConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("1", "", "", "1");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceDimNotDynamicTestStepConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = gen_slice_graph("1", "-10", "-5", "2");
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({20, 16, 32}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 16, 32}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 16, 32}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {10, 16, 32}, true, &prewarm_data);
+}
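+
+// The aten::slice.t tests below slice the int[] list produced by
+// aten::size (not a tensor); the sliced list then feeds aten::ones, so the
+// converter must handle list slicing while the engine is being built.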
+TEST(Converters, ATenSliceTStartEndBothNoneDynamicConvertsCorrectly) {
+    // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int[] = aten::size(%0)
+        %2 : None = prim::Constant()
+        %3 : Device = prim::Constant[value="cuda"]()
+        %4 : int = prim::Constant[value=6]()
+        %5 : int = prim::Constant[value=1]()
+        %6 : int[] = aten::slice(%1, %2, %2, %5)
+        %7 : Tensor = aten::ones(%6, %4, %2, %3, %2)
+        return (%7))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceTStartEndDynamicConvertsCorrectly) {
+    // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %start : int = prim::Constant[value=1]()
+        %end : int = prim::Constant[value=3]()
+        %1 : int = prim::Constant[value=1]()
+        %2 : None = prim::Constant()
+        %3 : int[] = aten::size(%0)
+        %4 : int[] = aten::slice(%3, %start, %end, %1)
+        %5 : Device = prim::Constant[value="cuda"]()
+        %6 : int = prim::Constant[value=6]()
+        %7 : Tensor = aten::ones(%4, %6, %2, %5, %2)
+        return (%7))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceTStartEndNegDynamicConvertsCorrectly) {
+    // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %start : int = prim::Constant[value=-3]()
+        %end : int = prim::Constant[value=-1]()
+        %1 : int = prim::Constant[value=1]()
+        %2 : None = prim::Constant()
+        %3 : int[] = aten::size(%0)
+        %4 : int[] = aten::slice(%3, %start, %end, %1)
+        %5 : Device = prim::Constant[value="cuda"]()
+        %6 : int = prim::Constant[value=6]()
+        %7 : Tensor = aten::ones(%4, %6, %2, %5, %2)
+        return (%7))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceTStartEndStepDynamicConvertsCorrectly) {
+    // aten::slice.t(t[] l, int? start=None, int? end=None, int step=1) -> (t[])
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %start : int = prim::Constant[value=0]()
+        %end : int = prim::Constant[value=3]()
+        %step : int = prim::Constant[value=2]()
+        %1 : int = prim::Constant[value=1]()
+        %2 : None = prim::Constant()
+        %3 : int[] = aten::size(%0)
+        %4 : int[] = aten::slice(%3, %start, %end, %step)
+        %5 : Device = prim::Constant[value="cuda"]()
+        %6 : int = prim::Constant[value=6]()
+        %7 : Tensor = aten::ones(%4, %6, %2, %5, %2)
+        return (%7))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 6, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+TEST(Converters, ATenSliceFromSizeStartDynamicConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=1]()
+        %2 : int = aten::size(%0, %1)
+        %3 : int = prim::Constant[value=3]()
+        %4 : int = aten::floordiv(%2, %3)
+        %end : None = prim::Constant()
+        %step : int = prim::Constant[value=1]()
+        %5 : Tensor = aten::slice(%0, %1, %4, %end, %step)
+        return (%5))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSliceFromSizeEndDynamicConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=1]()
+        %2 : int = aten::size(%0, %1)
+        %3 : int = prim::Constant[value=3]()
+        %4 : int = aten::floordiv(%2, %3)
+        %start : None = prim::Constant()
+        %step : int = prim::Constant[value=1]()
+        %5 : Tensor = aten::slice(%0, %1, %start, %4, %step)
+        return (%5))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+TEST(Converters, ATenSliceFromSizeStartEndDynamicConvertsCorrectly) {
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=1]()
+        %2 : int = aten::size(%0, %1)
+        %3 : int = prim::Constant[value=2]()
+        %4 : int = prim::Constant[value=5]()
+        %5 : int = aten::floordiv(%2, %3)
+        %6 : int = aten::floordiv(%2, %4)
+        %step : int = prim::Constant[value=1]()
+        %7 : Tensor = aten::slice(%0, %1, %6, %5, %step)
+        return (%7))IR";
+    baidu::mirana::poros::SliceConverter sliceconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    select_test_helper(graph_IR, &sliceconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+TEST(Converters, ATenNarrowScalarConvertsCorrectly) {
+    // aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
+    const auto graph_IR = gen_narrow_graph("2", "0", "2", true);
+    baidu::mirana::poros::NarrowConverter narrowconverter;
+    select_test_helper(graph_IR, &narrowconverter, {4, 4, 4, 4});
+}
+
+TEST(Converters, ATenNarrowScalarNegativeStartConvertsCorrectly) {
+    // aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
+    const auto graph_IR = gen_narrow_graph("2", "-3", "2", true);
+    baidu::mirana::poros::NarrowConverter narrowconverter;
+    select_test_helper(graph_IR, &narrowconverter, {4, 4, 4, 4});
+}
+
+TEST(Converters, ATenNarrowScalarNegativeDimConvertsCorrectly) {
+    // aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
+    const auto graph_IR = gen_narrow_graph("-2", "0", "2", true);
+    baidu::mirana::poros::NarrowConverter narrowconverter;
+    select_test_helper(graph_IR, &narrowconverter, {4, 4, 4, 4});
+}
+
+TEST(Converters, ATenNarrowScalarNegativeDimStartConvertsCorrectly) {
+    // aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
+    const auto graph_IR = gen_narrow_graph("-2", "-3", "2", true);
+    baidu::mirana::poros::NarrowConverter narrowconverter;
+    select_test_helper(graph_IR, &narrowconverter, {4, 4, 4, 4});
+}
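+
+// aten::split / aten::split_with_sizes / aten::unbind each return a
+// Tensor[] that the graphs unpack via prim::ListUnpack; split_test_helper
+// asserts the expected number of unpacked outputs and compares each one.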
+TEST(Converters, ATenSplitFixedTensorsConvertsCorrectly) {
+    // aten::split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : int = prim::Constant[value=3]()
+        %3 : int = prim::Constant[value=0]()
+        %4 : Tensor[] = aten::split(%1, %2, %3)
+        %5 : Tensor, %6 : Tensor = prim::ListUnpack(%4)
+        return (%5, %6))IR";
+    baidu::mirana::poros::SplitConverter splitconverter;
+    split_test_helper(graph_IR, &splitconverter, {6, 4, 3, 1}, 2);
+}
+
+TEST(Converters, ATenSplitUnfixedTensorsConvertsCorrectly) {
+    // aten::split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : int = prim::Constant[value=2]()
+        %3 : int = prim::Constant[value=1]()
+        %4 : Tensor[] = aten::split(%1, %2, %3)
+        %5 : Tensor, %6 : Tensor, %7 : Tensor = prim::ListUnpack(%4)
+        return (%5, %6, %7))IR";
+    baidu::mirana::poros::SplitConverter splitconverter;
+    split_test_helper(graph_IR, &splitconverter, {4, 5, 3, 1}, 3);
+}
+
+TEST(Converters, ATenSplitWithSizeDoubleTensorsConvertsCorrectly) {
+    // aten::split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : int[] = prim::Constant[value=[4, 2]]()
+        %3 : int = prim::Constant[value=0]()
+        %4 : Tensor[] = aten::split_with_sizes(%1, %2, %3)
+        %5 : Tensor, %6 : Tensor = prim::ListUnpack(%4)
+        return (%5, %6))IR";
+    baidu::mirana::poros::SplitConverter splitconverter;
+    split_test_helper(graph_IR, &splitconverter, {6, 4, 3, 1}, 2);
+}
+
+TEST(Converters, ATenSplitWithSizeTripleTensorsConvertsCorrectly) {
+    // aten::split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : int[] = prim::Constant[value=[5, 1, 2]]()
+        %3 : int = prim::Constant[value=1]()
+        %4 : Tensor[] = aten::split_with_sizes(%1, %2, %3)
+        %5 : Tensor, %6 : Tensor, %7 : Tensor = prim::ListUnpack(%4)
+        return (%5, %6, %7))IR";
+    baidu::mirana::poros::SplitConverter splitconverter;
+    split_test_helper(graph_IR, &splitconverter, {2, 8, 3, 1}, 3);
+}
+
+TEST(Converters, ATenUnbindTensorsConvertsCorrectly) {
+    // aten::unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]
+    const auto graph_IR = R"IR(
+      graph(%1 : Tensor):
+        %2 : int = prim::Constant[value=1]()
+        %3 : Tensor[] = aten::unbind(%1, %2)
+        %4 : Tensor, %5 : Tensor, %6 : Tensor = prim::ListUnpack(%3)
+        return (%4, %5, %6))IR";
+    baidu::mirana::poros::SplitConverter splitconverter;
+    split_test_helper(graph_IR, &splitconverter, {2, 3, 4}, 3);
+}
+
+TEST(Converters, EmbeddingConverterCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%weight.1 : Tensor,
+            %input.1 : Tensor):
+        %7 : bool = prim::Constant[value=0]()
+        %6 : int = prim::Constant[value=-1]()
+        %9 : Tensor = aten::embedding(%weight.1, %input.1, %6, %7, %7)
+        return (%9))IR";
+
+    baidu::mirana::poros::EmbeddingConverter embeddingconverter;
+    embedding_test_helper(graph_IR, &embeddingconverter);
+}
+
+static void gather_test_helper(const std::string& graph_IR,
+                               baidu::mirana::poros::IConverter* converter,
+                               const std::vector<at::Tensor>& input_data,
+                               bool is_dynamic = false,
+                               std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenGatherConverterCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : bool = prim::Constant[value=0]()
+        %3 : int = prim::Constant[value=1]()
+        %4 : Tensor = aten::gather(%0, %3, %1, %2)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto input = at::randn({3, 4, 5, 6}, {at::kCUDA});
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt64);
+    auto index = at::randint(0, 2, {3, 4, 5, 6}, options_pyt);
+
+    input_data.push_back(input);
+    input_data.push_back(index);
+
+    baidu::mirana::poros::GatherConverter gatherconverter;
+    gather_test_helper(graph_IR, &gatherconverter, input_data, false);
+}
+
+TEST(Converters, ATenGatherNegativeDimConverterCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : bool = prim::Constant[value=0]()
+        %3 : int = prim::Constant[value=-1]()
+        %4 : Tensor = aten::gather(%0, %3, %1, %2)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto input = at::randn({3, 4, 5, 6}, {at::kCUDA});
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt64);
+    auto index = at::randint(0, 2, {3, 4, 5, 6}, options_pyt);
+
+    input_data.push_back(input);
+    input_data.push_back(index);
+
+    baidu::mirana::poros::GatherConverter gatherconverter;
+    gather_test_helper(graph_IR, &gatherconverter, input_data, false);
+}
+TEST(Converters, ATenGatherDynamicConverterCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : bool = prim::Constant[value=0]()
+        %3 : int = prim::Constant[value=-1]()
+        %4 : Tensor = aten::gather(%0, %3, %1, %2)
+        return (%4))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt64);
+    // max
+    prewarm_data[0].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randint(0, 3, {4, 5, 6, 7}, options_pyt));
+    // min
+    prewarm_data[1].push_back(at::randn({3, 4, 5, 6}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randint(0, 2, {3, 4, 5, 6}, options_pyt));
+    // opt
+    prewarm_data[2].push_back(at::randn({3, 4, 5, 6}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randint(0, 2, {3, 4, 5, 6}, options_pyt));
+
+    std::vector<at::Tensor> input_data;
+    auto input = at::randn({3, 4, 5, 6}, {at::kCUDA});
+    auto index = at::randint(0, 2, {3, 4, 5, 6}, options_pyt);
+
+    input_data.push_back(input);
+    input_data.push_back(index);
+
+    baidu::mirana::poros::GatherConverter gatherconverter;
+    gather_test_helper(graph_IR, &gatherconverter, input_data, true, &prewarm_data);
+}
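+
+// aten::masked_fill broadcasts the mask against self; the "DynamicMore"
+// case below feeds a rank-1 mask of shape {2} into a {2, 2} input to
+// cover that broadcast path.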
+TEST(Converters, ATenMaskedfillScalarValueConverterCorrectly) {
+// aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=-1]()
+        %3 : Tensor = aten::masked_fill(%0, %1, %2)
+        return (%3))IR";
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kBool);
+    input_data.push_back(torch::tensor({false, true, false, true}, options_pyt).reshape({2, 2}));
+
+    baidu::mirana::poros::MaskedFillConverter maskedfillconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maskedfillconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenMaskedfillScalarValueDynamicConverterCorrectly) {
+// aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=-1]()
+        %3 : Tensor = aten::masked_fill(%0, %1, %2)
+        return (%3))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kBool);
+    // max
+    prewarm_data[0].push_back(at::randn({4, 2}, {at::kCUDA}));
+    prewarm_data[0].push_back(torch::tensor({false, true, false, true, false, true, false, true}, options_pyt).reshape({4, 2}));
+    // min
+    prewarm_data[1].push_back(at::randn({1, 2}, {at::kCUDA}));
+    prewarm_data[1].push_back(torch::tensor({false, true}, options_pyt).reshape({1, 2}));
+    // opt
+    prewarm_data[2].push_back(at::randn({2, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(torch::tensor({false, true, false, true}, options_pyt).reshape({2, 2}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data.push_back(torch::tensor({false, true, false, true}, options_pyt).reshape({2, 2}));
+
+    baidu::mirana::poros::MaskedFillConverter maskedfillconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maskedfillconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenMaskedfillScalarValueDynamicMoreConverterCorrectly) {
+// aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=-1]()
+        %3 : Tensor = aten::masked_fill(%0, %1, %2)
+        return (%3))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kBool);
+    // max
+    prewarm_data[0].push_back(at::randn({4, 2}, {at::kCUDA}));
+    prewarm_data[0].push_back(torch::tensor({false, true}, options_pyt).reshape({2}));
+    // min
+    prewarm_data[1].push_back(at::randn({1, 2}, {at::kCUDA}));
+    prewarm_data[1].push_back(torch::tensor({false, true}, options_pyt).reshape({2}));
+    // opt
+    prewarm_data[2].push_back(at::randn({2, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(torch::tensor({true, true}, options_pyt).reshape({2}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    input_data.push_back(torch::tensor({false, true}, options_pyt).reshape({2}));
+
+    baidu::mirana::poros::MaskedFillConverter maskedfillconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maskedfillconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+TEST(Converters, ATenMaskedfillTensorValueConverterCorrectly) {
+// aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %false : bool = prim::Constant[value=0]()
+        %2 : int = prim::Constant[value=2]()
+        %device : Device = prim::Constant[value="cuda:0"]()
+        %type : int = prim::dtype(%0)
+        %value : Tensor = aten::tensor(%2, %type, %device, %false)
+        %4 : Tensor = aten::masked_fill(%0, %1, %value)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({2, 2}, {at::kCUDA}));
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kBool);
+    input_data.push_back(torch::tensor({false, true, false, true}, options_pyt).reshape({2, 2}));
+
+    baidu::mirana::poros::MaskedFillConverter maskedfillconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &maskedfillconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+
+TEST(Converters, ATenIndexOneDimConverterCorrectly) {
+// aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor?[] = prim::ListConstruct(%0)
+        %3 : Tensor = aten::index(%1, %2)
+        return (%3))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    input_data.push_back(at::randint(0, 3, {2, 2}, options_pyt));  // indices
+    input_data.push_back(at::randint(0, 10, {3, 4, 5}, options_pyt));  // self
+
+    baidu::mirana::poros::IndexConverter indexconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenIndexOneDimDynamicConverterCorrectly) {
+// aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor?[] = prim::ListConstruct(%0)
+        %3 : Tensor = aten::index(%1, %2)
+        return (%3))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    input_data.push_back(at::randint(0, 3, {2, 2}, options_pyt));  // indices
+    input_data.push_back(at::randint(0, 10, {3, 4, 5}, options_pyt));  // self
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randint(0, 4, {3, 4}, options_pyt));  // indices
+    prewarm_data[0].push_back(at::randint(0, 10, {4, 5, 6}, options_pyt));  // self
+    prewarm_data[1].push_back(at::randint(0, 3, {2, 2}, options_pyt));  // indices
+    prewarm_data[1].push_back(at::randint(0, 10, {3, 4, 5}, options_pyt));  // self
+    prewarm_data[2].push_back(at::randint(0, 3, {2, 2}, options_pyt));  // indices
+    prewarm_data[2].push_back(at::randint(0, 10, {3, 4, 5}, options_pyt));  // self
+
+    baidu::mirana::poros::IndexConverter indexconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+TEST(Converters, ATenIndexPutConverterCorrectly) {
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor, %3 : Tensor):
+        %false : bool = prim::Constant[value=0]()
+        %none : NoneType = prim::Constant()
+        %zeros : Tensor = aten::zeros_like(%0, %none, %none, %none, %none, %none)
+        %index : Tensor?[] = prim::ListConstruct(%1, %2)
+        %out : Tensor = aten::index_put(%zeros, %index, %3, %false)
+        return (%out))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({2, 5}, options_pyt_float));
+    input_data.push_back(torch::tensor({0, 0, 1, 1}, options_pyt_long));
+    input_data.push_back(torch::tensor({0, 2, 1, 3}, options_pyt_long));
+    input_data.push_back(torch::tensor({1, 2, 3, 4}, options_pyt_float));
+
+    baidu::mirana::poros::IndexPutConverter indexputconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexputconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+// 2022.10.19 pitfall note: do not write a non-dynamic unit test for this
+// singular index_put IR. In the static case the whole graph is evaluated
+// during data prewarming, handing TensorRT a precomputed constant result,
+// which makes the test fail outright.
+TEST(Converters, ATenIndexPutConverterSingularValueDynamicCorrectly) {
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    const auto graph_IR = gen_indexput_with_singular_value_graph();
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({1, 16, 64}, options_pyt_float));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({1, 30, 64}, options_pyt_float));
+    prewarm_data[1].push_back(at::zeros({1, 8, 64}, options_pyt_float));
+    prewarm_data[2].push_back(at::zeros({1, 20, 64}, options_pyt_float));
+
+    baidu::mirana::poros::IndexPutConverter indexputconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexputconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+TEST(Converters, ATenIndexPutConverterDynamicCorrectly) {
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor, %3 : Tensor):
+        %false : bool = prim::Constant[value=0]()
+        %none : NoneType = prim::Constant()
+        %zeros : Tensor = aten::zeros_like(%0, %none, %none, %none, %none, %none)
+        %index : Tensor?[] = prim::ListConstruct(%1, %2)
+        %out : Tensor = aten::index_put(%zeros, %index, %3, %false)
+        return (%out))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({2, 5}, options_pyt_float));
+    input_data.push_back(torch::tensor({0, 0, 1, 1}, options_pyt_long));
+    input_data.push_back(torch::tensor({0, 2, 1, 3}, options_pyt_long));
+    input_data.push_back(torch::tensor({1, 2, 3, 4}, options_pyt_float));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::zeros({8, 5}, options_pyt_float));
+    prewarm_data[0].push_back(torch::tensor({2, 3, 3, 4}, options_pyt_long));
+    prewarm_data[0].push_back(torch::tensor({0, 2, 1, 3}, options_pyt_long));
+    prewarm_data[0].push_back(torch::tensor({8, 8, 8, 8}, options_pyt_float));
+
+    prewarm_data[1].push_back(at::zeros({2, 5}, options_pyt_float));
+    prewarm_data[1].push_back(torch::tensor({0, 0, 1, 1}, options_pyt_long));
+    prewarm_data[1].push_back(torch::tensor({0, 2, 1, 3}, options_pyt_long));
+    prewarm_data[1].push_back(torch::tensor({3, 3, 3, 3}, options_pyt_float));
+
+    prewarm_data[2].push_back(at::zeros({4, 5}, options_pyt_float));
+    prewarm_data[2].push_back(torch::tensor({0, 0, 1, 1}, options_pyt_long));
+    prewarm_data[2].push_back(torch::tensor({0, 2, 1, 3}, options_pyt_long));
+    prewarm_data[2].push_back(torch::tensor({1, 2, 3, 4}, options_pyt_float));
+
+    baidu::mirana::poros::IndexPutConverter indexputconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexputconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenIndexPutConverterDynamicFromCopyCorrectly) {
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    const auto graph_IR = gen_indexput_graph("21");
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::ones({8, 16, 64, 16, 16}, options_pyt_float));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::ones({64, 16, 64, 16, 16}, options_pyt_float));
+    prewarm_data[1].push_back(at::ones({8, 16, 64, 16, 16}, options_pyt_float));
+    prewarm_data[2].push_back(at::ones({32, 16, 64, 16, 16}, options_pyt_float));
+
+    baidu::mirana::poros::IndexPutConverter indexputconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexputconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenIndexPutConverterStaticFromCopyCorrectly) {
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    const auto graph_IR = gen_indexput_graph("21");
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::ones({1, 16, 64, 56, 56}, options_pyt_float));
+
+    baidu::mirana::poros::IndexPutConverter indexputconverter;
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &indexputconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+TEST(Converters, ATenScatterConverterCorrectly) {
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=1]()
+        %3 : float = prim::Constant[value=2.5]()
+        %4 : Tensor = aten::scatter(%0, %2, %1, %3)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({2, 4}, options_pyt_float));
+    input_data.push_back(torch::tensor({{0, 1, 2, 0}, {1, 2, 0, 3}}, options_pyt_long));
+
+    baidu::mirana::poros::ScatterConverter scatterconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &scatterconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenScatterSelfValueDiffTypeConverterCorrectly) {
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=1]()
+        %3 : float = prim::Constant[value=2.5]()
+        %4 : Tensor = aten::scatter(%0, %2, %1, %3)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_int = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kInt);
+    input_data.push_back(at::zeros({2, 4}, options_pyt_int));
+    input_data.push_back(torch::tensor({{0, 1, 2, 3}, {1, 2, 0, 3}}, options_pyt_long));
+
+    baidu::mirana::poros::ScatterConverter scatterconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &scatterconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenScatterSelfIndexDiffShapeConverterCorrectly) {
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)
+// The index tensor's shape may differ from self's, but their ranks (number of dims) must match.
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=0]()
+        %3 : float = prim::Constant[value=2.5]()
+        %4 : Tensor = aten::scatter(%0, %2, %1, %3)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({2, 4}, options_pyt_float));
+    input_data.push_back(torch::tensor({{0}}, options_pyt_long));
+
+    baidu::mirana::poros::ScatterConverter scatterconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = false;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &scatterconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+TEST(Converters, ATenScatterDynamicConverterCorrectly) {
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : int = prim::Constant[value=1]()
+        %3 : float = prim::Constant[value=2.5]()
+        %4 : Tensor = aten::scatter(%0, %2, %1, %3)
+        return (%4))IR";
+
+    std::vector<at::Tensor> input_data;
+    auto options_pyt_long = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kLong);
+    auto options_pyt_float = torch::TensorOptions().device(torch::kCUDA, 0).dtype(torch::kFloat);
+    input_data.push_back(at::zeros({3, 4}, options_pyt_float));
+    input_data.push_back(torch::tensor({{0, 1}, {1, 2}}, options_pyt_long));
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    // max
+    prewarm_data[0].push_back(at::randint(0, 4, {3, 4}, options_pyt_float));
+    prewarm_data[0].push_back(at::randint(0, 2, {3, 4}, options_pyt_long));
+    // min
+    prewarm_data[1].push_back(at::randint(0, 4, {2, 3}, options_pyt_float));
+    prewarm_data[1].push_back(at::randint(0, 2, {1, 1}, options_pyt_long));
+    // opt
+    prewarm_data[2].push_back(at::randint(0, 4, {3, 4}, options_pyt_float));
+    prewarm_data[2].push_back(at::randint(0, 2, {1, 1}, options_pyt_long));
+
+    baidu::mirana::poros::ScatterConverter scatterconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &scatterconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static void chunk_test_helper(const std::string& graph_IR,
+                              std::vector<int64_t> shape,
+                              const int& output_num) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::ChunkConverter chunkconverter;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &chunkconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(output_num, graph_output.size());
+    ASSERT_EQ(graph_output.size(), poros_output.size());
+    for (size_t i = 0; i < graph_output.size(); i++) {
+        ASSERT_TRUE(graph_output[i].equal(poros_output[i]));
+    }
+}
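+
+// prim::ConstantChunk splits the input into `chunks` pieces along `dim`;
+// when the dim size is not divisible (e.g. 11 split three ways below) the
+// trailing chunk is smaller, which the helper verifies output by output.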
+TEST(Converters, PrimConstantChunkTwoOutputsConverterCorrectly) {
+    // prim::ConstantChunk
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor, %2 : Tensor = prim::ConstantChunk[chunks=2, dim=-1](%0)
+        return (%1, %2))IR";
+    chunk_test_helper(graph_IR, {5, 6, 8}, 2);
+}
+
+TEST(Converters, PrimConstantChunkThreeOutputsConverterCorrectly) {
+    // prim::ConstantChunk
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor, %2 : Tensor, %3 : Tensor = prim::ConstantChunk[chunks=3, dim=0](%0)
+        return (%1, %2, %3))IR";
+    chunk_test_helper(graph_IR, {11, 6, 8}, 3);
+}
+
+TEST(Converters, PrimConstantChunkFourOutputsConverterCorrectly) {
+    // prim::ConstantChunk
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor, %2 : Tensor, %3 : Tensor, %4 : Tensor = prim::ConstantChunk[chunks=4, dim=1](%0)
+        return (%1, %2, %3, %4))IR";
+    chunk_test_helper(graph_IR, {7, 13, 8}, 4);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/shape_handle_test.cpp b/poros/unittest/converter/shape_handle_test.cpp
new file mode 100644
index 0000000000..b2685046af
--- /dev/null
+++ b/poros/unittest/converter/shape_handle_test.cpp
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file shape_handle_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Tue Jul 27 14:24:21 CST 2022
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/shape_handle.h"
+#include "poros/util/test_util.h"
+
+static void shape_handle_test_helper(const std::string& graph_IR,
+                                     baidu::mirana::poros::IConverter* converter,
+                                     std::vector<int64_t> shape,
+                                     bool is_dynamic = false,
+                                     std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenShapeAsTensorConvertsCorrectly) {
+    // aten::_shape_as_tensor(Tensor self) -> (Tensor)
+    // aten::_shape_as_tensor outputs a CPU tensor by default; to keep all
+    // data on the same device, an aten::to.device cast is added.
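+    // (the dtype constant 3 in the IR below is at::kInt, so the shape
+    // tensor is cast to int32 on CUDA)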
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::_shape_as_tensor(%0)
+        %2 : Device = prim::Constant[value="cuda"]()
+        %3 : int = prim::Constant[value=3]()
+        %4 : bool = prim::Constant[value=0]()
+        %5 : None = prim::Constant()
+        %6 : Tensor = aten::to(%1, %2, %3, %4, %4, %5)
+        return (%6))IR";
+    baidu::mirana::poros::ShapeastensorConverter shapeastensorconverter;
+    shape_handle_test_helper(graph_IR, &shapeastensorconverter, {4, 5, 3, 1});
+}
+
+TEST(Converters, ATenShapeAsTensorDynamicConvertsCorrectly) {
+    // aten::_shape_as_tensor(Tensor self) -> (Tensor)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::_shape_as_tensor(%0)
+        %2 : Device = prim::Constant[value="cuda"]()
+        %3 : int = prim::Constant[value=3]()
+        %4 : bool = prim::Constant[value=0]()
+        %5 : None = prim::Constant()
+        %6 : Tensor = aten::to(%1, %2, %3, %4, %4, %5)
+        return (%6))IR";
+    baidu::mirana::poros::ShapeastensorConverter shapeastensorconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({5, 10, 7, 8}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({4, 5, 6, 7}, {at::kCUDA}));
+
+    shape_handle_test_helper(graph_IR, &shapeastensorconverter, {4, 5, 6, 7}, true, &prewarm_data);
+}
+
+// aten::len.Tensor(Tensor t) -> (int)
+// aten::len.t(t[] a) -> (int)
+TEST(Converters, ATenLenDynamicConvertsCorrectly) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : int = aten::len(%0)
+        %2 : NoneType = prim::Constant()
+        %3 : bool = prim::Constant[value=0]()
+        %4 : Device = prim::Constant[value="cuda:0"]()
+        %5 : Tensor = aten::tensor(%1, %2, %4, %3)
+        return (%5))IR";
+
+    baidu::mirana::poros::LenConverter lenconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({7, 2}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({3, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 2}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::ones({7, 2}, {at::kCUDA}));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &lenconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/shuffle_test.cpp b/poros/unittest/converter/shuffle_test.cpp
new file mode 100644
index 0000000000..8c49c456d4
--- /dev/null
+++ b/poros/unittest/converter/shuffle_test.cpp
@@ -0,0 +1,313 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file shuffle_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/util/test_util.h"
+#include "poros/converter/gpu/shuffle.h"
+
+static void shuffle_test_helper(const std::string& graph_IR,
+                                baidu::mirana::poros::IConverter* converter,
+                                std::vector<int64_t> shape) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static void shuffle_dy_test_helper(const std::string& graph_IR,
+                                   const std::vector<at::Tensor>& input_data,
+                                   baidu::mirana::poros::IConverter* converter,
+                                   bool is_dynamic = false,
+                                   std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run both the original graph and the poros engine to get outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+std::string gen_double_int_graph(const std::string& op,
+                                 const std::string& first_int,
+                                 const std::string& second_int) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=)IR" + first_int + R"IR(]()
+        %2 : int = prim::Constant[value=)IR" + second_int + R"IR(]()
+        %3 : Tensor = aten::)IR" + op + R"IR((%0, %1, %2)
+        return (%3))IR";
+}
+
+std::string gen_int_list_graph(const std::string& op, const std::string& int_list) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int[] = prim::Constant[value=[)IR" + int_list + R"IR(]]()
+        %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+        return (%2))IR";
+}
+
+std::string gen_pixel_shuffle_graph(const std::string& upscale_factor) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=)IR" + upscale_factor + R"IR(]()
+        %2 : Tensor = aten::pixel_shuffle(%0, %1)
+        return (%2))IR";
+}
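+
+// For reference, gen_double_int_graph("transpose", "1", "2") expands to:
+//   graph(%0 : Tensor):
+//     %1 : int = prim::Constant[value=1]()
+//     %2 : int = prim::Constant[value=2]()
+//     %3 : Tensor = aten::transpose(%0, %1, %2)
+//     return (%3)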
+TEST(Converters, ATenTransposeConvertsCorrectly) {
+    // aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+    const auto graph_IR = gen_double_int_graph("transpose", "1", "2");
+    baidu::mirana::poros::TransposeConverter transposeconverter;
+    shuffle_test_helper(graph_IR, &transposeconverter, {2, 3, 4});
+}
+
+TEST(Converters, ATenTransposeNegativeConvertsCorrectly) {
+    // aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+    const auto graph_IR = gen_double_int_graph("transpose", "-1", "-3");
+    baidu::mirana::poros::TransposeConverter transposeconverter;
+    shuffle_test_helper(graph_IR, &transposeconverter, {2, 3, 4, 5, 6});
+}
+
+TEST(Converters, ATenViewConvertsCorrectly) {
+    // aten::view(Tensor(a) self, int[] size) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("view", "1, 6");
+    baidu::mirana::poros::PermuteViewConverter permuteviewconverter;
+    shuffle_test_helper(graph_IR, &permuteviewconverter, {2, 3});
+}
+
+TEST(Converters, ATenViewNegativeConvertsCorrectly) {
+    // aten::view(Tensor(a) self, int[] size) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("view", "-1, 8");
+    baidu::mirana::poros::PermuteViewConverter permuteviewconverter;
+    shuffle_test_helper(graph_IR, &permuteviewconverter, {4, 4});
+}
+
+TEST(Converters, ATenPermuteConvertsCorrectly) {
+    // aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("permute", "1, 0");
+    baidu::mirana::poros::PermuteViewConverter permuteviewconverter;
+    shuffle_test_helper(graph_IR, &permuteviewconverter, {2, 3});
+}
+
+TEST(Converters, ATenPermute3DConvertsCorrectly) {
+    // aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("permute", "1, 2, 0");
+    baidu::mirana::poros::PermuteViewConverter permuteviewconverter;
+    shuffle_test_helper(graph_IR, &permuteviewconverter, {1, 2, 3});
+}
+
+TEST(Converters, ATenPermute5DConvertsCorrectly) {
+    // aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("permute", "3, 1, 0, 2, 4");
+    baidu::mirana::poros::PermuteViewConverter permuteviewconverter;
+    shuffle_test_helper(graph_IR, &permuteviewconverter, {2, 3, 4, 5, 1});
+}
+
+TEST(Converters, ATenReshapeConvertsCorrectly) {
+    // aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("reshape", "3, 2");
+    baidu::mirana::poros::ReshapeConverter reshapeconverter;
+    shuffle_test_helper(graph_IR, &reshapeconverter, {2, 3});
+}
+
+TEST(Converters, ATenReshapeNegativeConvertsCorrectly) {
+    // aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)
+    const auto graph_IR = gen_int_list_graph("reshape", "-1, 8");
+    baidu::mirana::poros::ReshapeConverter reshapeconverter;
+    shuffle_test_helper(graph_IR, &reshapeconverter, {4, 4});
+}
+
+TEST(Converters, ATenFlattenConvertsCorrectly) {
+    // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+    const auto graph_IR = gen_double_int_graph("flatten", "0", "-1");
+    baidu::mirana::poros::FlattenConverter flattenconverter;
+    shuffle_test_helper(graph_IR, &flattenconverter, {1, 2, 3});
+}
+
+TEST(Converters, ATenFlattenStartEnddimConvertsCorrectly) {
+    // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+    const auto graph_IR = gen_double_int_graph("flatten", "1", "2");
+    baidu::mirana::poros::FlattenConverter flattenconverter;
+    shuffle_test_helper(graph_IR, &flattenconverter, {1, 2, 3});
+}
+
+TEST(Converters, ATenT1DConvertsCorrectly) {
+    // aten::t(Tensor(a) self) -> Tensor(a)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::t(%0)
+        %2 : Tensor = aten::relu(%1)
+        return (%2))IR";
+    baidu::mirana::poros::AtenTConverter atentConverter;
+    shuffle_test_helper(graph_IR, &atentConverter, {5});
+}
+
+TEST(Converters, ATenT2DConvertsCorrectly) {
+    // aten::t(Tensor(a) self) -> Tensor(a)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::t(%0)
+        return (%1))IR";
+    baidu::mirana::poros::AtenTConverter atentConverter;
+    shuffle_test_helper(graph_IR, &atentConverter, {5, 6});
+}
+
+TEST(Converters, ATenPixelShuffleConvertsCorrectly) {
+    // aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+    const auto graph_IR = gen_pixel_shuffle_graph("3");
+    baidu::mirana::poros::PixelShuffleConverter pixelshuffleconverter;
+    shuffle_test_helper(graph_IR, &pixelshuffleconverter, {1, 9, 4, 4});
+}
graph_IR = gen_pixel_shuffle_graph("3"); + baidu::mirana::poros::PixelShuffleConverter pixelshuffleconverter; + shuffle_test_helper(graph_IR, &pixelshuffleconverter, {9, 5, 6}); +} + +TEST(Converters, ATenPixelShuffle5DConvertsCorrectly) { + // aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + const auto graph_IR = gen_pixel_shuffle_graph("3"); + baidu::mirana::poros::PixelShuffleConverter pixelshuffleconverter; + shuffle_test_helper(graph_IR, &pixelshuffleconverter, {7, 8, 9, 5, 6}); +} + +static void shuffle_dynamic_test_helper(const std::string& graph_IR, + baidu::mirana::poros::IConverter* converter, + const std::vector& input_data, + bool is_dynamic = false, + std::vector>* prewarm_data = nullptr) { + baidu::mirana::poros::PorosOptions poros_option; // default device GPU + poros_option.is_dynamic = is_dynamic; + // 运行原图与engine获取结果 + std::vector graph_output; + std::vector poros_output; + ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter, + input_data, graph_output, poros_output, prewarm_data)); + + ASSERT_EQ(1, graph_output.size()); + ASSERT_EQ(1, poros_output.size()); + ASSERT_TRUE(graph_output[0].equal(poros_output[0])); +} + +TEST(Converters, ATenViewdynamicConvertsCorrectly) { + // aten::view(Tensor(a) self, int[] size) -> Tensor(a) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : int = aten::size(%0, %1) + %4 : int = aten::size(%0, %2) + %5 : int[] = prim::ListConstruct(%4, %3) + %6 : Tensor = aten::view(%0, %5) + return (%6))IR"; + baidu::mirana::poros::PermuteViewConverter permuteviewconverter; + std::vector input_data; + input_data.push_back(at::randn({2, 3}, {at::kCUDA})); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 5}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({2, 3}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({2, 3}, {at::kCUDA})); + + shuffle_dynamic_test_helper(graph_IR, &permuteviewconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenReshapedynamicConvertsCorrectly) { + // aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a) + const auto graph_IR = R"IR( + graph(%0 : Tensor): + %1 : int[] = aten::size(%0) + %2 : int, %3 : int = prim::ListUnpack(%1) + %4 : int[] = prim::ListConstruct(%3, %2) + %5 : Tensor = aten::reshape(%0, %4) + return (%5))IR"; + baidu::mirana::poros::ReshapeConverter reshapeconverter; + std::vector input_data; + input_data.push_back(at::randn({2, 3}, {at::kCUDA})); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({4, 5}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({2, 3}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({2, 3}, {at::kCUDA})); + + shuffle_dynamic_test_helper(graph_IR, &reshapeconverter, input_data, true, &prewarm_data); +} + +TEST(Converters, ATenFlattenConvertsDynamicCorrectly) { + // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) + const auto graph_IR = gen_double_int_graph("flatten", "0", "2"); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({10, 64, 128}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({5, 32, 64}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({5, 32, 64}, {at::kCUDA})); + + std::vector input_data; + input_data.push_back(at::randn({5, 32, 64}, {at::kCUDA})); + baidu::mirana::poros::FlattenConverter flattenconverter; + 
shuffle_dy_test_helper(graph_IR, input_data, &flattenconverter, true, &prewarm_data); +} + +TEST(Converters, ATenFlattenConvertsDynamicNegStartEndCorrectly) { + // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) + const auto graph_IR = gen_double_int_graph("flatten", "-3", "-2"); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({10, 64, 128, 32}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + + std::vector input_data; + input_data.push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + baidu::mirana::poros::FlattenConverter flattenconverter; + shuffle_dy_test_helper(graph_IR, input_data, &flattenconverter, true, &prewarm_data); +} + +TEST(Converters, ATenFlattenConvertsDynamicStartEqualEndCorrectly) { + // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) + const auto graph_IR = gen_double_int_graph("flatten", "1", "1"); + + std::vector> prewarm_data = {{}, {}, {}}; + prewarm_data[0].push_back(at::randn({10, 64, 128, 32}, {at::kCUDA})); + prewarm_data[1].push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + prewarm_data[2].push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + + std::vector input_data; + input_data.push_back(at::randn({5, 32, 64, 16}, {at::kCUDA})); + baidu::mirana::poros::FlattenConverter flattenconverter; + shuffle_dy_test_helper(graph_IR, input_data, &flattenconverter, true, &prewarm_data); +} \ No newline at end of file diff --git a/poros/unittest/converter/softmax_test.cpp b/poros/unittest/converter/softmax_test.cpp new file mode 100644 index 0000000000..70f36a3ec1 --- /dev/null +++ b/poros/unittest/converter/softmax_test.cpp @@ -0,0 +1,148 @@ +// Copyright (c) 2022 Baidu, Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
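A note on the recurring `prewarm_data` pattern in the dynamic-shape tests above and below: each test fills three tensor lists before compiling. Judging purely from the shapes the tests pass (the largest first, then the production shape twice), the three slots appear to feed TensorRT's optimization profile as maximum, minimum, and preferred ("opt") input shapes; the helper signatures do not document the ordering, so treat this as an inference rather than a stated contract. A minimal sketch of the pattern, with hypothetical shapes:

    // slot 0: largest expected shape   (profile max)  -- ordering inferred, not documented
    // slot 1: smallest expected shape  (profile min)
    // slot 2: shape to optimize for    (profile opt)
    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
    prewarm_data[0].push_back(at::randn({16, 32}, {at::kCUDA}));
    prewarm_data[1].push_back(at::randn({1, 32}, {at::kCUDA}));
    prewarm_data[2].push_back(at::randn({8, 32}, {at::kCUDA}));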
diff --git a/poros/unittest/converter/softmax_test.cpp b/poros/unittest/converter/softmax_test.cpp
new file mode 100644
index 0000000000..70f36a3ec1
--- /dev/null
+++ b/poros/unittest/converter/softmax_test.cpp
@@ -0,0 +1,148 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file softmax_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/softmax.h"
+#include "poros/util/test_util.h"
+
+static void softmax_test_helper(const std::string& graph_IR,
+                                std::vector<int64_t> shape = {5}) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+    // input_data.push_back(at::randint(0, 5, {5}, {at::kCUDA}));
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::SoftmaxConverter softmaxconverter;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &softmaxconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+static std::string gen_softmax_graph(const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : None = prim::Constant()
+        %2 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %3 : Tensor = aten::softmax(%0, %2, %1)
+        return (%3))IR";
+}
+
+TEST(Converters, ATenSoftmax1DConvertsCorrectly) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("0");
+    softmax_test_helper(graph_IR, {5});
+}
+
+TEST(Converters, ATenSoftmaxNDConvertsCorrectlySub3DIndex) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("1");
+    softmax_test_helper(graph_IR, {1, 2, 3, 4, 5});
+}
+
+TEST(Converters, ATenSoftmaxNDConvertsCorrectlyAbove3DIndex) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("3");
+    softmax_test_helper(graph_IR, {1, 2, 3, 4, 5});
+}
+
+TEST(Converters, ATenSoftmaxNDConvertsCorrectlyNegtiveOneIndex) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("-1");
+    softmax_test_helper(graph_IR, {1, 2, 3, 4, 5});
+}
+
+TEST(Converters, ATenSoftmaxNDConvertsCorrectlyNegtiveIndex) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("-2");
+    softmax_test_helper(graph_IR, {1, 2, 3, 4, 5});
+}
+
+static void softmax_dy_test_helper(const std::string& graph_IR,
+                                   const std::vector<at::Tensor>& input_data,
+                                   bool is_dynamic = false,
+                                   std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::SoftmaxConverter softmaxconverter;
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &softmaxconverter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenSoftmaxInputSingleDimDynamicConvertsCorrectly) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("0");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({60}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({40}, {at::kCUDA}));
+
+    softmax_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSoftmaxDynamicConvertsCorrectly) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("2");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({20, 30, 40, 50}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+
+    softmax_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSoftmaxDynamicNegtiveDimConvertsCorrectly) {
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    const auto graph_IR = gen_softmax_graph("-2");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({20, 30, 40, 50}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({10, 20, 30, 40}, {at::kCUDA}));
+
+    softmax_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/squeeze_test.cpp b/poros/unittest/converter/squeeze_test.cpp
new file mode 100644
index 0000000000..31d060d549
--- /dev/null
+++ b/poros/unittest/converter/squeeze_test.cpp
@@ -0,0 +1,218 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file squeeze_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/squeeze.h"
+#include "poros/util/test_util.h"
+
+static void squeeze_test_helper(const std::string& graph_IR,
+                                baidu::mirana::poros::IConverter* converter,
+                                std::vector<int64_t> shape) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static std::string gen_squeeze_one_input_schema_graph(const std::string& op) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %2 : Tensor = aten::)IR" + op + R"IR((%0)
+        %3 : Tensor = aten::relu(%2)
+        return (%3))IR";
+}
+
+TEST(Converters, ATenSqueezeOneInputConvertsCorrectly) {
+    // aten::squeeze(Tensor(a) self) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_one_input_schema_graph("squeeze");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 1, 3});
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 1, 1, 5});
+}
+
+static std::string gen_squeeze_graph(const std::string& op, const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %2 : Tensor = aten::)IR" + op + R"IR((%0, %1)
+        %3 : Tensor = aten::relu(%2)
+        return (%3))IR";
+}
+
+TEST(Converters, ATenSqueezeConvertsCorrectly) {
+    // aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("squeeze", "1");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 1, 3});
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 2, 3});
+}
+
+TEST(Converters, ATenSqueezeNegtiveConvertsCorrectly) {
+    // aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("squeeze", "-1");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 3, 1});
+    squeeze_test_helper(graph_IR, &squeezeconverter, {4, 2, 3});
+}
+
+TEST(Converters, ATenUnSqueezeConvertsCorrectly) {
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("unsqueeze", "1");
+    baidu::mirana::poros::UnSqueezeConverter unsqueezeconverter;
+    squeeze_test_helper(graph_IR, &unsqueezeconverter, {4, 3, 2});
+}
+
+TEST(Converters, ATenUnSqueezeNegtiveConvertsCorrectly) {
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("unsqueeze", "-1");
+    baidu::mirana::poros::UnSqueezeConverter unsqueezeconverter;
+    squeeze_test_helper(graph_IR, &unsqueezeconverter, {4, 3, 2});
+}
+
+static void squeeze_dy_test_helper(const std::string& graph_IR,
+                                   baidu::mirana::poros::IConverter* converter,
+                                   const std::vector<at::Tensor>& input_data,
+                                   bool is_dynamic = false,
+                                   std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+TEST(Converters, ATenSqueezeOneInputDynamicConvertsCorrectly) {
+    // aten::squeeze(Tensor(a) self) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_one_input_schema_graph("squeeze");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({40, 1, 1, 60}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({20, 1, 1, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20, 1, 1, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({20, 1, 1, 40}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &squeezeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenUnSqueezeDynamicConvertsCorrectly) {
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("unsqueeze", "2");
+    baidu::mirana::poros::UnSqueezeConverter unsqueezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({40, 50, 60}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &unsqueezeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenUnSqueezeInputSingleDimDynamicConvertsCorrectly) {
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("unsqueeze", "0");
+    baidu::mirana::poros::UnSqueezeConverter unsqueezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({40}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({20}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({20}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &unsqueezeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenUnSqueezeDynamicNegtiveDimConvertsCorrectly) {
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("unsqueeze", "-1");
+    baidu::mirana::poros::UnSqueezeConverter unsqueezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({40, 50, 60}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({20, 30, 40}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &unsqueezeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSqueezeDynamicConvertsCorrectly) {
+    // aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("squeeze", "1");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({40, 1, 60}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({20, 1, 40}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({20, 1, 40}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({20, 1, 40}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &squeezeconverter, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenSqueezeDynamicNegtiveDimConvertsCorrectly) {
+    // aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+    const auto graph_IR = gen_squeeze_graph("squeeze", "-1");
+    baidu::mirana::poros::SqueezeConverter squeezeconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+
+    prewarm_data[0].push_back(at::randn({1, 60, 1}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({1, 40, 1}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({1, 40, 1}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({1, 40, 1}, {at::kCUDA}));
+
+    squeeze_dy_test_helper(graph_IR, &squeezeconverter, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/stack_test.cpp b/poros/unittest/converter/stack_test.cpp
new file mode 100644
index 0000000000..dafccec426
--- /dev/null
+++ b/poros/unittest/converter/stack_test.cpp
@@ -0,0 +1,186 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file stack_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/stack.h"
+#include "poros/util/test_util.h"
+
+static void stack_test_helper(const std::string& graph_IR,
+                              std::vector<int64_t> shape1 = {5},
+                              std::vector<int64_t> shape2 = {5},
+                              bool Triple_inputs = false,
+                              std::vector<int64_t> shape3 = {5}) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape1, {at::kCUDA}));
+    input_data.push_back(at::randn(shape2, {at::kCUDA}));
+    if (Triple_inputs) {
+        input_data.push_back(at::randn(shape3, {at::kCUDA}));
+    }
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::StackConverter stackconverter;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &stackconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+static std::string gen_double_inputs_stack_graph(const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor[] = prim::ListConstruct(%0, %1)
+        %3 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %4 : Tensor = aten::stack(%2, %3)
+        return (%4))IR";
+}
+
+static std::string gen_triple_inputs_stack_graph(const std::string& dim) {
+    return R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor):
+        %3 : Tensor[] = prim::ListConstruct(%0, %1, %2)
+        %4 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %5 : Tensor = aten::stack(%3, %4)
+        return (%5))IR";
+}
+
+TEST(Converters, ATenStackDoubleTensorConvertsCorrectly) {
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_stack_graph("0");
+    stack_test_helper(graph_IR);
+}
+
+TEST(Converters, ATenStackDoubleTensoroneDimConvertsCorrectly) {
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_stack_graph("1");
+    stack_test_helper(graph_IR, {5, 3}, {5, 3});
+}
+
+TEST(Converters, ATenStackTripleTensorConvertsCorrectly) {
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_triple_inputs_stack_graph("2");
+    stack_test_helper(graph_IR, {5, 2, 3}, {5, 2, 3}, true, {5, 2, 3});
+}
+
+TEST(Converters, ATenVstackDoubleTensorConvertsCorrectly) {
+    // aten::vstack(Tensor[] tensors) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : Tensor[] = prim::ListConstruct(%0, %1)
+        %3 : Tensor = aten::vstack(%2)
+        return (%3))IR";
+    stack_test_helper(graph_IR, {3, 1}, {3, 1});
+}
+
+TEST(Converters, ATenVstackTripleTensorConvertsCorrectly) {
+    // aten::vstack(Tensor[] tensors) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor, %2 : Tensor):
+        %3 : Tensor[] = prim::ListConstruct(%0, %1, %2)
+        %4 : Tensor = aten::vstack(%3)
+        return (%4))IR";
+    stack_test_helper(graph_IR, {5, 2, 3}, {5, 2, 3}, true, {5, 2, 3});
+}
+
+
+static void stack_dy_test_helper(const std::string& graph_IR,
+                                 const std::vector<at::Tensor>& input_data,
+                                 bool is_dynamic = false,
+                                 std::vector<std::vector<at::Tensor>>* prewarm_data = nullptr) {
+    baidu::mirana::poros::StackConverter stackconverter;
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = is_dynamic;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &stackconverter,
+                input_data, graph_output, poros_output, prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenStackDoubleTensorDynamicTestConvertsCorrectly) {
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_stack_graph("2");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    stack_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenStackDoubleTensorDynamicNegDimTestConvertsCorrectly) {
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    const auto graph_IR = gen_double_inputs_stack_graph("-2");
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    stack_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
+
+TEST(Converters, ATenVStackDoubleTensorDynamicTestConvertsCorrectly) {
+    // aten::vstack(Tensor[] tensors) -> Tensor
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %3 : Tensor[] = prim::ListConstruct(%0, %1)
+        %4 : Tensor = aten::vstack(%3)
+        return (%4))IR";
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[0].push_back(at::randn({10, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+    input_data.push_back(at::randn({5, 5, 3, 3}, {at::kCUDA}));
+
+    stack_dy_test_helper(graph_IR, input_data, true, &prewarm_data);
+}
\ No newline at end of file
diff --git a/poros/unittest/converter/to_test.cpp b/poros/unittest/converter/to_test.cpp
new file mode 100644
index 0000000000..ce643cbe96
--- /dev/null
+++ b/poros/unittest/converter/to_test.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file to_test.cpp
+* @author wangrui39@baidu.com
+* @date Sunday November 14 11:36:11 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/to.h"
+#include "poros/util/test_util.h"
+
+static void add_test_helper(const std::string& graph_IR,
+                            baidu::mirana::poros::IConverter* converter,
+                            std::vector<int64_t> shape1 = {5},
+                            std::vector<int64_t> shape2 = {5}) {
+    std::vector<at::Tensor> input_data;
+
+    input_data.push_back(at::ones(shape1, {at::kCUDA}));
+    input_data.push_back(at::ones(shape2, {at::kCUDA}));
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, converter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+}
+
+static std::string gen_to_graph() {
+    std::string graph = R"IR(
+      graph(%0 : Tensor, %1 : Tensor):
+        %2 : float = prim::Constant[value=2]()
+        %3 : int = prim::Constant[value=3]()
+        %4 : bool = prim::Constant[value=0]()
+        %5 : None = prim::Constant()
+        %6 : Tensor = aten::to(%0, %3, %4, %4, %5)
+        %7 : Tensor = aten::to(%1, %6, %4, %4, %5)
+        %35 : Device = prim::Constant[value="cuda"]()
+        %10 : Tensor = aten::to(%6, %35, %3, %4, %4, %5)
+        %11 : Tensor = aten::to(%7, %35, %3, %4, %4, %5)
+        %8 : int = prim::Constant[value=1]()
+        %9 : Tensor = aten::add(%10, %11, %8)
+        return (%9))IR";
+    return graph;
+}
+
+TEST(Converters, ATenToConvertsCorrectly) {
+    const auto graph_IR = gen_to_graph();
+    baidu::mirana::poros::ToConverter toconverter;
+    add_test_helper(graph_IR, &toconverter, {3, 4}, {3, 4});
+}
\ No newline at end of file
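For readers unfamiliar with the `aten::to` overloads exercised by `gen_to_graph()`: the graph first casts both inputs to dtype 3, which is `at::kInt` (`to.dtype` for the first tensor, `to.other` to match it for the second), then moves both to CUDA with `to.device`, and finally adds them. An eager-mode sketch of the same sequence, illustrative only and not part of this diff:

    at::Tensor a = at::ones({3, 4}, at::kCUDA);
    at::Tensor b = at::ones({3, 4}, at::kCUDA);
    a = a.to(at::kInt);                         // aten::to.dtype
    b = b.to(a);                                // aten::to.other (match a's dtype/device)
    a = a.to(at::Device(at::kCUDA), at::kInt);  // aten::to.device
    b = b.to(at::Device(at::kCUDA), at::kInt);
    at::Tensor out = a + b;                     // aten::add with alpha = 1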
diff --git a/poros/unittest/converter/topk_test.cpp b/poros/unittest/converter/topk_test.cpp
new file mode 100644
index 0000000000..bee9f473f5
--- /dev/null
+++ b/poros/unittest/converter/topk_test.cpp
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file topk_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/topk.h"
+#include "poros/util/test_util.h"
+
+static void topk_test_helper(const std::string& graph_IR,
+                             std::vector<int64_t> shape) {
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::randn(shape, {at::kCUDA}));
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::TopkConverter topkconverter;
+
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &topkconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(2, graph_output.size());
+    ASSERT_EQ(2, poros_output.size());
+
+    // ASSERT_TRUE(baidu::mirana::poros::testutil::almostEqual(graph_output[0], poros_output[0], 2e-6));
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+    ASSERT_TRUE(graph_output[1].equal(poros_output[1]));
+}
+
+static std::string gen_topk_graph(const std::string& k,
+                                  const std::string& dim,
+                                  const std::string& largest,
+                                  const std::string& sorted) {
+    return R"IR(
+      graph(%0 : Tensor):
+        %1 : int = prim::Constant[value=)IR" + k + R"IR(]()
+        %2 : int = prim::Constant[value=)IR" + dim + R"IR(]()
+        %3 : bool = prim::Constant[value=)IR" + largest + R"IR(]()
+        %4 : bool = prim::Constant[value=)IR" + sorted + R"IR(]()
+        %5 : Tensor, %6 : Tensor = aten::topk(%0, %1, %2, %3, %4)
+        return (%5, %6))IR";
+}
+
+TEST(Converters, ATenTopkConvertsCorrectly) {
+    // aten::topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    const auto graph_IR = gen_topk_graph("10", "0", "1", "1");
+    topk_test_helper(graph_IR, {20, 10});
+}
+
+TEST(Converters, ATenTopkDimConvertsCorrectly) {
+    // aten::topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    const auto graph_IR = gen_topk_graph("5", "1", "1", "1");
+    topk_test_helper(graph_IR, {20, 10});
+}
+
+TEST(Converters, ATenTopkDimNegtiveConvertsCorrectly) {
+    // aten::topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    const auto graph_IR = gen_topk_graph("5", "-1", "1", "1");
+    topk_test_helper(graph_IR, {20, 10});
+}
+
+TEST(Converters, ATenTopklargestConvertsCorrectly) {
+    // aten::topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    const auto graph_IR = gen_topk_graph("10", "0", "0", "1");
+    topk_test_helper(graph_IR, {20, 10});
+}
+
+// sorted argument is not used in TensorRT for aten::topk
\ No newline at end of file
diff --git a/poros/unittest/converter/unary_test.cpp b/poros/unittest/converter/unary_test.cpp
new file mode 100644
index 0000000000..6bda6ea8e8
--- /dev/null
+++ b/poros/unittest/converter/unary_test.cpp
@@ -0,0 +1,216 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file unary_test.cpp
+* @author tianshaoqing@baidu.com
+* @date Wed Sep 27 11:24:21 CST 2021
+* @brief
+**/
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/converter/gpu/unary.h"
+#include "poros/util/test_util.h"
+
+static void unary_test_helper(const std::string& op,
+                              std::vector<int64_t> shape = {10}) {
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %1 : Tensor = aten::)IR" + op + R"IR((%0)
+        return (%1))IR";
+    std::vector<at::Tensor> input_data;
+    // shift the sampling window so each op stays inside its valid domain
+    float offset = 0;
+    if (op == "acosh") {
+        offset += 1;
+    }
+    if (op == "abs" || op == "neg") {
+        offset -= 0.5;
+    }
+    auto input_tensor = at::empty(shape, {at::kCUDA}).uniform_(0 + offset, 0.5 + offset);
+    if (op == "round") {
+        input_tensor = input_tensor * 50;
+    }
+    input_data.push_back(input_tensor);
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    baidu::mirana::poros::UnaryConverter unaryconverter;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &unaryconverter,
+                input_data, graph_output, poros_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+
+    ASSERT_TRUE(baidu::mirana::poros::testutil::almost_equal(graph_output[0], poros_output[0], 2e-6));
+    // ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
+
+TEST(Converters, ATenCosConvertsCorrectly) {
+    // aten::cos(Tensor self) -> Tensor
+    unary_test_helper("cos");
+}
+
+TEST(Converters, ATenAcosConvertsCorrectly) {
+    // aten::acos(Tensor self) -> Tensor
+    unary_test_helper("acos");
+}
+
+TEST(Converters, ATenCoshConvertsCorrectly) {
+    // aten::cosh(Tensor self) -> Tensor
+    unary_test_helper("cosh");
+}
+
+TEST(Converters, ATenSinConvertsCorrectly) {
+    // aten::sin(Tensor self) -> Tensor
+    unary_test_helper("sin");
+}
+
+TEST(Converters, ATenAsinConvertsCorrectly) {
+    // aten::asin(Tensor self) -> Tensor
+    unary_test_helper("asin");
+}
+
+TEST(Converters, ATenSinhConvertsCorrectly) {
+    // aten::sinh(Tensor self) -> Tensor
+    unary_test_helper("sinh");
+}
+
+TEST(Converters, ATenTanConvertsCorrectly) {
+    // aten::tan(Tensor self) -> Tensor
+    unary_test_helper("tan");
+}
+
+TEST(Converters, ATenAtanConvertsCorrectly) {
+    // aten::atan(Tensor self) -> Tensor
+    unary_test_helper("atan");
+}
+
+TEST(Converters, ATenAbsConvertsCorrectly) {
+    // aten::abs(Tensor self) -> Tensor
+    unary_test_helper("abs");
+}
+
+TEST(Converters, ATenFloorConvertsCorrectly) {
+    // aten::floor(Tensor self) -> Tensor
+    unary_test_helper("floor");
+}
+
+TEST(Converters, ATenReciprocalConvertsCorrectly) {
+    // aten::reciprocal(Tensor self) -> Tensor
+    unary_test_helper("reciprocal");
+}
+
+TEST(Converters, ATenLogConvertsCorrectly) {
+    // aten::log(Tensor self) -> Tensor
+    unary_test_helper("log");
+}
+
+TEST(Converters, ATenCeilConvertsCorrectly) {
+    // aten::ceil(Tensor self) -> Tensor
+    unary_test_helper("ceil");
+}
+
+TEST(Converters, ATenSqrtConvertsCorrectly) {
+    // aten::sqrt(Tensor self) -> Tensor
+    unary_test_helper("sqrt");
+}
+
+TEST(Converters, ATenExpConvertsCorrectly) {
+    // aten::exp(Tensor self) -> Tensor
+    unary_test_helper("exp");
+}
+
+TEST(Converters, ATenNegConvertsCorrectly) {
+    // aten::neg(Tensor self) -> Tensor
+    unary_test_helper("neg");
+}
+
+TEST(Converters, ATenErfConvertsCorrectly) {
+    // aten::erf(Tensor self) -> Tensor
+    unary_test_helper("erf");
+}
+
+TEST(Converters, ATenAsinhConvertsCorrectly) {
+    // aten::asinh(Tensor self) -> Tensor
+    unary_test_helper("asinh");
+}
+
+TEST(Converters, ATenAcoshConvertsCorrectly) {
+    // aten::acosh(Tensor self) -> Tensor
+    unary_test_helper("acosh");
+}
+
+TEST(Converters, ATenAtanhConvertsCorrectly) {
+    // aten::atanh(Tensor self) -> Tensor
+    unary_test_helper("atanh");
+}
+
+TEST(Converters, ATenLog2ConvertsCorrectly) {
+    // aten::log2(Tensor self) -> Tensor
+    unary_test_helper("log2");
+}
+
+TEST(Converters, ATenLog10ConvertsCorrectly) {
+    // aten::log10(Tensor self) -> Tensor
+    unary_test_helper("log10");
+}
+
+TEST(Converters, ATenRoundConvertsCorrectly) {
+    // aten::round(Tensor self) -> (Tensor)
+    unary_test_helper("round");
+}
+
+TEST(Converters, ATenFloorFloat2IntConvertsCorrectly) {
+    // aten::floor.float(float a) -> (int)
+    const auto graph_IR = R"IR(
+      graph(%0 : Tensor):
+        %dim0 : int = prim::Constant[value=0]()
+        %dim1 : int = prim::Constant[value=1]()
+        %1 : float = prim::Constant[value=-1.5]()
+        %2 : int = aten::size(%0, %dim0)
+        %3 : int = aten::size(%0, %dim1)
+        %4 : float = aten::div(%2, %3)
+        %5 : int = aten::floor(%4)
+        %6 : int = aten::floor(%1)
+        %7 : int[] = prim::ListConstruct(%5, %6)
+        %8 : NoneType = prim::Constant()
+        %9 : bool = prim::Constant[value=0]()
+        %10 : Device = prim::Constant[value="cuda:0"]()
+        %11 : Tensor = aten::tensor(%7, %8, %10, %9)
+        return (%11))IR";
+
+    baidu::mirana::poros::UnaryConverter unaryconverter;
+
+    std::vector<std::vector<at::Tensor>> prewarm_data = {{}, {}, {}};
+    prewarm_data[0].push_back(at::randn({7, 2}, {at::kCUDA}));
+    prewarm_data[1].push_back(at::randn({3, 2}, {at::kCUDA}));
+    prewarm_data[2].push_back(at::randn({5, 2}, {at::kCUDA}));
+
+    std::vector<at::Tensor> input_data;
+    input_data.push_back(at::ones({7, 2}, {at::kCUDA}));
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    poros_option.is_dynamic = true;
+    // run the original graph and the poros engine to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> poros_output;
+    ASSERT_TRUE(baidu::mirana::poros::testutil::run_graph_and_poros(graph_IR, poros_option, &unaryconverter,
+                input_data, graph_output, poros_output, &prewarm_data));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, poros_output.size());
+    ASSERT_TRUE(graph_output[0].equal(poros_output[0]));
+}
\ No newline at end of file
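The next file tests folding batch normalization into a preceding convolution. The standard algebra: with BN scale gamma, shift beta, running mean mu, running variance sigma^2 and epsilon eps, the fused convolution uses w' = w * gamma / sqrt(sigma^2 + eps) (applied per output channel) and b' = (b - mu) * gamma / sqrt(sigma^2 + eps) + beta. A minimal ATen sketch of that rewrite follows; it is reference math only, and the actual pass in poros/lowering/fuse_conv_bn.* may be organized differently:

    // fold BN statistics into conv weights/bias (sketch, per-output-channel)
    at::Tensor fold_conv_weight(const at::Tensor& conv_w, const at::Tensor& bn_w,
                                const at::Tensor& bn_var, double eps) {
        auto scale = bn_w / at::sqrt(bn_var + eps);
        // reshape scale to {C_out, 1, 1, ...} so it broadcasts over the kernel dims
        std::vector<int64_t> view_shape(conv_w.dim(), 1);
        view_shape[0] = conv_w.size(0);
        return conv_w * scale.reshape(view_shape);
    }

    at::Tensor fold_conv_bias(const at::Tensor& conv_b, const at::Tensor& bn_w,
                              const at::Tensor& bn_b, const at::Tensor& bn_mean,
                              const at::Tensor& bn_var, double eps) {
        auto scale = bn_w / at::sqrt(bn_var + eps);
        return (conv_b - bn_mean) * scale + bn_b;
    }

This is why the tests below draw `var` from `at::abs(at::randn(...))`: the variance must stay non-negative for the square root to be well defined.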
diff --git a/poros/unittest/op_fuser/fuse_conv_bn_test.cpp b/poros/unittest/op_fuser/fuse_conv_bn_test.cpp
new file mode 100644
index 0000000000..0416b9308a
--- /dev/null
+++ b/poros/unittest/op_fuser/fuse_conv_bn_test.cpp
@@ -0,0 +1,233 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_conv_bn_test.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-03-31 16:11:18
+* @brief
+**/
+
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/lowering/fuse_conv_bn.h"
+#include "poros/lowering/op_fuse_pass.h"
+#include "poros/util/graph_test_helper.h"
+
+std::vector<int64_t> ones(size_t n) {
+    return std::vector<int64_t>(n, 1);
+}
+
+std::vector<int64_t> zeros(size_t n) {
+    return std::vector<int64_t>(n, 0);
+}
+
+static void fuse_test_helper(const std::string &graph_IR,
+                             const size_t &dim,
+                             std::shared_ptr<baidu::mirana::poros::IFuser> fuser,
+                             std::vector<int64_t> input_shape,
+                             std::vector<int64_t> conv_w_shape,
+                             std::vector<int64_t> conv_b_shape
+) {
+    // tensors and int[] constants travel in one vector; the stripped element
+    // type is assumed to be c10::IValue (see poros/util/graph_test_helper.h)
+    std::vector<c10::IValue> input_data;
+    input_data.push_back(at::randn(input_shape, {at::kCPU}));
+    input_data.push_back(at::randn(conv_w_shape, {at::kCPU}));
+    input_data.push_back(at::randn(conv_b_shape, {at::kCPU}));
+
+    input_data.push_back(at::IntArrayRef(ones(dim)));  // stride
+    input_data.push_back(at::IntArrayRef(zeros(dim))); // padding
+    input_data.push_back(at::IntArrayRef(ones(dim)));  // dilation
+
+    auto bn_shape = conv_b_shape;
+    input_data.push_back(at::randn(bn_shape, {at::kCPU}));          // weight
+    input_data.push_back(at::randn(bn_shape, {at::kCPU}));          // bias
+    input_data.push_back(at::randn(bn_shape, {at::kCPU}));          // mean
+    input_data.push_back(at::abs(at::randn(bn_shape, {at::kCPU}))); // var
+    const std::vector input_data_type_mask = {
+        baidu::mirana::poros::graphtester::InputTensor,
+
+        baidu::mirana::poros::graphtester::ConstantTensor,
+        baidu::mirana::poros::graphtester::ConstantTensor,
+
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+
+        baidu::mirana::poros::graphtester::ConstantTensor,
+        baidu::mirana::poros::graphtester::ConstantTensor,
+        baidu::mirana::poros::graphtester::ConstantTensor,
+        baidu::mirana::poros::graphtester::ConstantTensor,
+
+    };
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the fused graph to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> fused_output;
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::run_graph_and_fused_graph(graph_IR, poros_option, fuser,
+                input_data, input_data_type_mask,
+                graph_output, fused_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, fused_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::almost_equal(graph_output[0], fused_output[0], 1e-6));
+}
+
+static std::string gen_conv3d_batch_norm3d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %3 : int = prim::Constant[value=1]()
+        %conv_out : Tensor = aten::conv3d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+static std::string gen_conv2d_batch_norm2d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %3 : int = prim::Constant[value=1]()
+        %conv_out : Tensor = aten::conv2d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+static std::string gen_conv1d_batch_norm1d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %3 : int = prim::Constant[value=1]()
+        %conv_out : Tensor = aten::conv1d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+static std::string gen_convolution_batch_norm3d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %3 : int = prim::Constant[value=1]()
+        %output_padding : int[] = prim::Constant[value=[0, 0, 0]]()
+        %conv_out : Tensor = aten::_convolution(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %7, %output_padding, %3, %7, %7, %4, %4)
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+static std::string gen_convolution_batch_norm2d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %3 : int = prim::Constant[value=1]()
+        %output_padding : int[] = prim::Constant[value=[0, 0]]()
+        %conv_out : Tensor = aten::_convolution(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %7, %output_padding, %3, %7, %7, %4, %4)
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+static std::string gen_convolution_batch_norm1d_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor, %conv_padding : Tensor, %conv_dilation : Tensor, %bn_w : Tensor, %bn_b : Tensor, %bn_m : Tensor, %bn_v : Tensor):
+
+        %7 : bool = prim::Constant[value=0]()
+        %4 : bool = prim::Constant[value=1]()
+        %3 : int = prim::Constant[value=1]()
+        %output_padding : int[] = prim::Constant[value=[0]]()
+        %conv_out : Tensor = aten::_convolution(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %7, %output_padding, %3, %7, %7, %4, %4)
+        %8 : float = prim::Constant[value=1.0000000000000001e-05]()
+        %9 : float = prim::Constant[value=0.10000000000000001]()
+        %10 : Tensor = aten::batch_norm(%conv_out, %bn_w, %bn_b, %bn_m, %bn_v, %7, %9, %8, %4)
+        return (%10))IR";
+}
+
+TEST(Fusers, ATenFuseConvBN3d_Test) {
+    const auto graph_IR = gen_conv3d_batch_norm3d_graph();
+    // fuser class name assumed from the include above (fuse_conv_bn.h)
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 3, fuser, {3, 5, 8, 4, 6}, {12, 5, 3, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConvBN2d_Test) {
+    const auto graph_IR = gen_conv2d_batch_norm2d_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 2, fuser, {3, 5, 4, 6}, {12, 5, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConvBN1d_Test) {
+    const auto graph_IR = gen_conv1d_batch_norm1d_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+    fuse_test_helper(graph_IR, 1, fuser, {3, 5, 6}, {12, 5, 3}, {12});
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConvolutionBN3d_Test) {
+    const auto graph_IR = gen_convolution_batch_norm3d_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 3, fuser, {3, 5, 8, 4, 6}, {12, 5, 3, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConvolutionBN2d_Test) {
+    const auto graph_IR = gen_convolution_batch_norm2d_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 2, fuser, {3, 5, 4, 6}, {12, 5, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConvolutionBN1d_Test) {
+    const auto graph_IR = gen_convolution_batch_norm1d_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvBatchNorm>();
+
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+    fuse_test_helper(graph_IR, 1, fuser, {3, 5, 6}, {12, 5, 3}, {12});
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+}
\ No newline at end of file
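The next file tests a simpler rewrite: a constant scalar multiply after a convolution. Because convolution is linear in its weights and bias, s * conv(x, W, b) == conv(x, s*W, s*b), so the multiplier can be folded into the convolution parameters. A minimal ATen sketch of the identity, for reference only (the pass itself lives in poros/lowering/fuse_conv_mul.*):

    // equivalent to at::conv2d(x, w, b) * s, with the scalar folded into w and b
    at::Tensor fused_conv2d_mul(const at::Tensor& x, const at::Tensor& w,
                                const at::Tensor& b, double s) {
        return at::conv2d(x, w * s, b * s);
    }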
diff --git a/poros/unittest/op_fuser/fuse_conv_mul_test.cpp b/poros/unittest/op_fuser/fuse_conv_mul_test.cpp
new file mode 100644
index 0000000000..a74ccb2822
--- /dev/null
+++ b/poros/unittest/op_fuser/fuse_conv_mul_test.cpp
@@ -0,0 +1,137 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file: /icode-poros/baidu/mirana/poros/unittest/op_fuser/fuse_conv_mul_test.cpp
+* @author: zhangfan51@baidu.com
+* @date: 2022-04-24 19:00:03
+* @brief:
+**/
+
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/lowering/fuse_conv_mul.h"
+#include "poros/lowering/op_fuse_pass.h"
+#include "poros/util/graph_test_helper.h"
+
+static std::vector<int64_t> ones(size_t n) {
+    return std::vector<int64_t>(n, 1);
+}
+
+static std::vector<int64_t> zeros(size_t n) {
+    return std::vector<int64_t>(n, 0);
+}
+
+static void fuse_test_helper(const std::string &graph_IR,
+                             const size_t &dim,
+                             std::shared_ptr<baidu::mirana::poros::IFuser> fuser,
+                             std::vector<int64_t> input_shape,
+                             std::vector<int64_t> conv_w_shape,
+                             std::vector<int64_t> conv_b_shape
+) {
+    // element type assumed to be c10::IValue (see poros/util/graph_test_helper.h)
+    std::vector<c10::IValue> input_data;
+    input_data.push_back(at::randn(input_shape, {at::kCPU}));
+    input_data.push_back(at::randn(conv_w_shape, {at::kCPU}));
+    input_data.push_back(at::randn(conv_b_shape, {at::kCPU}));
+
+    input_data.push_back(at::IntArrayRef(ones(dim)));  // stride
+    input_data.push_back(at::IntArrayRef(zeros(dim))); // padding
+    input_data.push_back(at::IntArrayRef(ones(dim)));  // dilation
+
+    const std::vector input_data_type_mask = {
+        baidu::mirana::poros::graphtester::InputTensor,
+
+        baidu::mirana::poros::graphtester::ConstantTensor,
+        baidu::mirana::poros::graphtester::ConstantTensor,
+
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+        baidu::mirana::poros::graphtester::ConstantIntVector,
+    };
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run the original graph and the fused graph to get both outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> fused_output;
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::run_graph_and_fused_graph(graph_IR, poros_option, fuser,
+                input_data, input_data_type_mask,
+                graph_output, fused_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, fused_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::almost_equal(graph_output[0], fused_output[0], 1e-6));
+}
+
+static std::string gen_conv3d_mul_graph() {
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor,
+            %conv_padding : Tensor, %conv_dilation : Tensor):
+        %3 : int = prim::Constant[value=1]()
+        %4 : float = prim::Constant[value=2.0]()
+        %conv_out : Tensor = aten::conv3d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %5 : Tensor = aten::mul(%conv_out, %4)
+        return (%5))IR";
+}
+
+static std::string gen_conv2d_mul_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor,
+            %conv_padding : Tensor, %conv_dilation : Tensor):
+        %3 : int = prim::Constant[value=1]()
+        %4 : float = prim::Constant[value=2.0]()
+        %conv_out : Tensor = aten::conv2d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %5 : Tensor = aten::mul(%conv_out, %4)
+        return (%5))IR";
+}
+
+static std::string gen_conv1d_mul_graph() {
+
+    return R"IR(
+      graph(%x : Tensor, %conv_w : Tensor, %conv_b : Tensor, %conv_stride : Tensor,
+            %conv_padding : Tensor, %conv_dilation : Tensor):
+        %3 : int = prim::Constant[value=1]()
+        %4 : float = prim::Constant[value=2.0]()
+        %conv_out : Tensor = aten::conv1d(%x, %conv_w, %conv_b, %conv_stride, %conv_padding, %conv_dilation, %3)
+        %5 : Tensor = aten::mul(%conv_out, %4)
+        return (%5))IR";
+}
+
+TEST(Fusers, ATenFuseConv3dMul_Test) {
+    const auto graph_IR = gen_conv3d_mul_graph();
+    // fuser class name assumed from the include above (fuse_conv_mul.h)
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvMul>();
+
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 3, fuser, {3, 5, 8, 4, 6}, {12, 5, 3, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 3, fuser, {1, 2, 3, 4, 5}, {3, 2, 3, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConv2dMul_Test) {
+    const auto graph_IR = gen_conv2d_mul_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvMul>();
+
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+    fuse_test_helper(graph_IR, 2, fuser, {3, 5, 4, 6}, {12, 5, 3, 3}, {12});
+    fuse_test_helper(graph_IR, 2, fuser, {1, 2, 4, 5}, {3, 2, 3, 3}, {3});
+}
+
+TEST(Fusers, ATenFuseConv1dMul_Test) {
+    const auto graph_IR = gen_conv1d_mul_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseConvMul>();
+
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+    fuse_test_helper(graph_IR, 1, fuser, {3, 5, 6}, {12, 5, 3}, {12});
+    fuse_test_helper(graph_IR, 1, fuser, {1, 2, 5}, {3, 2, 3}, {3});
+}
\ No newline at end of file
+        %3 : int = prim::Constant[value=-1]()
+        %8 : int = prim::Constant[value=8]()
+        %16 : int = prim::Constant[value=16]()
+        %false : bool = prim::Constant[value=0]()
+        %fold : int = prim::Constant[value=21]()
+
+        %292 : int[] = aten::size(%x)
+        %nt.3 : int, %c.3 : int, %h.3 : int, %w.3 : int = prim::ListUnpack(%292)
+        %n_batch.3 : int = aten::floordiv(%nt.3, %16)
+        %298 : int[] = prim::ListConstruct(%n_batch.3, %16, %c.3, %h.3, %w.3)
+        %x1.7 : Tensor = aten::view(%x, %298)
+
+        %out : Tensor = aten::zeros_like(%x1.7, %none, %none, %none, %none, %none) # temporal_shift.py:12:18
+
+        %302 : Tensor = aten::slice(%x1.7, %0, %none, %none, %1) # temporal_shift.py:13:33
+        %303 : Tensor = aten::slice(%302, %1, %1, %none, %1) # temporal_shift.py:13:33
+        %304 : Tensor = aten::slice(%303, %2, %none, %fold, %1) # temporal_shift.py:13:33
+
+        %305 : Tensor = aten::slice(%out, %0, %none, %none, %1) # temporal_shift.py:13:12
+        %306 : Tensor = aten::slice(%305, %1, %none, %3, %1) # temporal_shift.py:13:12
+        %307 : Tensor = aten::slice(%306, %2, %none, %fold, %1) # temporal_shift.py:13:12
+        %308 : Tensor = aten::copy_(%307, %304, %false) # temporal_shift.py:13:12
+
+        %322 : int[] = prim::ListConstruct(%nt.3, %c.3, %h.3, %w.3)
+        %x0.7 : Tensor = aten::view(%out, %322)
+        return (%x0.7))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+def shift(x, n_segment, fold_div=3, inplace=False):
+    nt, c, h, w = x.size()
+    n_batch = nt // n_segment
+    x = x.view(n_batch, n_segment, c, h, w)
+
+    fold = c // fold_div
+
+    out = torch.zeros_like(x)
+    out[:, :-1, :fold] = x[:, 1:, :fold]
+    out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold]
+    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]
+    return out.view(nt, c, h, w)
+ * **/
+static std::string gen_complex_slice_graph() {
+    std::string graph = R"IR(
+      graph(%x : Tensor):
+        %none : NoneType = prim::Constant()
+        %0 : int = prim::Constant[value=0]()
+        %1 : int = prim::Constant[value=1]()
+        %2 : int = prim::Constant[value=2]()
+        %3 : int = prim::Constant[value=-1]()
+        %8 : int = prim::Constant[value=8]()
+        %16 : int = prim::Constant[value=16]()
+        %false : bool = prim::Constant[value=0]()
+        %fold : int = prim::Constant[value=21]()
+
+        %292 : int[] = aten::size(%x)
+        %nt.3 : int, %c.3 : int, %h.3 : int, %w.3 : int = prim::ListUnpack(%292)
+        %n_batch.3 : int = aten::floordiv(%nt.3, %16)
+        %298 : int[] = prim::ListConstruct(%n_batch.3, %16, %c.3, %h.3, %w.3)
+        %x1.7 : Tensor = aten::view(%x, %298)
+
+        %out : Tensor = aten::zeros_like(%x1.7, %none, %none, %none, %none, %none) # temporal_shift.py:12:18
+
+        %302 : Tensor = aten::slice(%x1.7, %0, %none, %none, %1) # temporal_shift.py:13:33
+        %303 : Tensor = aten::slice(%302, %1, %1, %none, %1) # temporal_shift.py:13:33
+        %304 : Tensor = aten::slice(%303, %2, %none, %fold, %1) # temporal_shift.py:13:33
+
+        %305 : Tensor = aten::slice(%out, %0, %none, %none, %1) # temporal_shift.py:13:12
+        %306 : Tensor = aten::slice(%305, %1, %none, %3, %1) # temporal_shift.py:13:12
+        %307 : Tensor = aten::slice(%306, %2, %none, %fold, %1) # temporal_shift.py:13:12
+        %308 : Tensor = aten::copy_(%307, %304, %false) # temporal_shift.py:13:12
+
+        %309 : Tensor = aten::slice(%302, %1, %none, %3, %1) # temporal_shift.py:14:41
+        %310 : int = aten::mul(%2, %fold) # temporal_shift.py:14:57
+        %311 : Tensor = aten::slice(%309, %2, %fold, %310, %1) # temporal_shift.py:14:41
+
+        %312 : Tensor = aten::slice(%out, %0, %none, %none, %1) # temporal_shift.py:14:12
+        %313 : Tensor = aten::slice(%312, %1, %1, %none, %1) # temporal_shift.py:14:12
+        %314 : Tensor = aten::slice(%313, %2, %fold, %310, %1) # temporal_shift.py:14:12
+        %315 : Tensor = aten::copy_(%314, %311, %false) # temporal_shift.py:14:12
+
+        %316 : Tensor = aten::slice(%302, %1, %none, %none, %1) # temporal_shift.py:15:35
+        %317 : Tensor = aten::slice(%316, %2, %310, %none, %1) # temporal_shift.py:15:35
+
+        %318 : Tensor = aten::slice(%out, %0, %none, %none, %1) # temporal_shift.py:15:12
+        %319 : Tensor = aten::slice(%318, %1, %none, %none, %1) # temporal_shift.py:15:12
+        %320 : Tensor = aten::slice(%319, %2, %310, %none, %1) # temporal_shift.py:15:12
+        %321 : Tensor = aten::copy_(%320, %317, %false) # temporal_shift.py:15:12
+
+        %322 : int[] = prim::ListConstruct(%nt.3, %c.3, %h.3, %w.3)
+        %x0.7 : Tensor = aten::view(%out, %322)
+        return (%x0.7))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+ * class SliceTest(torch.nn.Module):
+    def __init__(self):
+        super(SliceTest, self).__init__()
+
+    def forward(self, x):
+        size = x.size()
+        #resize = size[:-1]
+        attention_mask = torch.zeros(size)
+        attention_mask[2:3:1, 2, :, 0, :] = 1
+        out = attention_mask * 3
+        return out
+ * **/
+static std::string gen_select_graph_with_single_value() {
+    std::string graph = R"IR(
+      graph(%x.1 : Tensor, %value : Tensor):
+        %33 : bool = prim::Constant[value=0]()
+        %5 : NoneType = prim::Constant()
+        %10 : int = prim::Constant[value=1]() # ../../test.py:11:44
+        %12 : int = prim::Constant[value=2]() # ../../test.py:11:30
+        %13 : int = prim::Constant[value=0]() # ../../test.py:11:36
+        %15 : int = prim::Constant[value=3]() # ../../test.py:11:25
+        %size.1 : int[] = aten::size(%x.1) # ../../test.py:8:15
+        %attention_mask.1 : Tensor = aten::zeros(%size.1, %5, %5, %5, %5) # ../../test.py:10:25
+        %16 : Tensor = aten::slice(%attention_mask.1, %13, %12, %15, %10) # ../../test.py:11:8
+        %18 : Tensor = aten::select(%16, %10, %12) # ../../test.py:11:8
+        %23 : Tensor = aten::slice(%18, %10, %5, %5, %10) # ../../test.py:11:8
+        %25 : Tensor = aten::select(%23, %12, %13) # ../../test.py:11:8
+        %30 : Tensor = aten::slice(%25, %12, %5, %5, %10) # ../../test.py:11:8
+        %36 : Tensor = aten::copy_(%30, %value, %33) # ../../test.py:11:8
+        %out.1 : Tensor = aten::mul(%attention_mask.1, %15) # ../../test.py:12:14
+        return (%out.1))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+ * class SliceTest(torch.nn.Module):
+    def __init__(self):
+        super(SliceTest, self).__init__()
+
+    def forward(self, x):
+        size = x.size()
+        #resize = size[:-1]
+        attention_mask = torch.zeros(size)
+        attention_mask[0, 2, 1:4:1, 0, :] = 1
+        out = attention_mask * 3
+        return out
+ * **/
+static std::string gen_select_graph_with_single_value2() {
+    std::string graph = R"IR(
+      graph(%x.1 : Tensor, %value : Tensor):
+        %30 : bool = prim::Constant[value=0]()
+        %5 : NoneType = prim::Constant()
+        %10 : int = prim::Constant[value=1]() # ../../test.py:11:44
+        %12 : int = prim::Constant[value=0]() # ../../test.py:11:23
+        %13 : int = prim::Constant[value=2]() # ../../test.py:11:26
+        %19 : int = prim::Constant[value=4]() # ../../test.py:11:31
+        %35 : int = prim::Constant[value=3]() # ../../test.py:12:31
+        %size.1 : int[] = aten::size(%x.1) # ../../test.py:8:15
+        %attention_mask.1 : Tensor = aten::zeros(%size.1, %5, %5, %5, %5) # ../../test.py:10:25
+        %15 : Tensor = aten::select(%attention_mask.1, %12, %12) # ../../test.py:11:8
+        %17 : Tensor = aten::select(%15, %12, %13) # ../../test.py:11:8
+        %20 : Tensor = aten::slice(%17, %12, %10, %19, %10) # ../../test.py:11:8
+        %22 : Tensor = aten::select(%20, %10, %12) # ../../test.py:11:8
+        %27 : Tensor = aten::slice(%22, %10, %5, %5, %10) # ../../test.py:11:8
+        %33 : Tensor = aten::copy_(%27, %value, %30) # ../../test.py:11:8
+        %out.1 : Tensor = aten::mul(%attention_mask.1, %35) # ../../test.py:12:14
+        return (%out.1))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+class ClipBoxes(torch.nn.Module):
+    def __init__(self):
+        super(ClipBoxes, self).__init__()
+
+    def forward(self, boxes):
+        boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
+        return boxes * 2
+ * **/
+static std::string gen_select_with_tensor_value() {
+    std::string graph = R"IR(
+      graph(%boxes.1 : Tensor):
+        %31 : bool = prim::Constant[value=0]()
+        %6 : NoneType = prim::Constant()
+        %5 : int = prim::Constant[value=1]() # test.py:54:37
+        %3 : int = prim::Constant[value=0]() # test.py:54:49
+        %34 : int = prim::Constant[value=2]() # test.py:60:23
+        %8 : Tensor = aten::slice(%boxes.1, %3, %6, %6, %5) # test.py:54:37
+        %13 : Tensor = aten::slice(%8, %5, %6, %6, %5) # test.py:54:37
+        %15 : Tensor = aten::select(%13, %34, %3) # test.py:54:37
+        %17 : Tensor = aten::clamp(%15, %3, %6) # test.py:54:25
+        %23 : Tensor = aten::slice(%boxes.1, %3, %6, %6, %5) # test.py:54:8
+        %28 : Tensor = aten::slice(%23, %5, %6, %6, %5) # test.py:54:8
+        %30 : Tensor = aten::select(%28, %34, %3) # test.py:54:8
+        %32 : Tensor = aten::copy_(%30, %17, %31) # test.py:54:8
+        %35 : Tensor = aten::mul(%boxes.1, %34) # test.py:60:15
+        return (%35))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+class ClipBoxes(torch.nn.Module):
+    def __init__(self):
+        super(ClipBoxes, self).__init__()
+
+    def forward(self, boxes):
+        boxes[:, 0, :, :] = torch.clamp(boxes[:, 0, :, :], min=0)
+        return boxes * 2
+ * **/
+static std::string gen_select_with_tensor_value2() {
+    std::string graph = R"IR(
+      graph(%boxes.1 : Tensor):
+        %41 : bool = prim::Constant[value=0]()
+        %6 : NoneType = prim::Constant()
+        %5 : int = prim::Constant[value=1]() # test.py:46:40
+        %3 : int = prim::Constant[value=0]() # test.py:46:49
+        %44 : int = prim::Constant[value=2]() # test.py:47:23
+        %8 : Tensor = aten::slice(%boxes.1, %3, %6, %6, %5) # test.py:46:40
+        %10 : Tensor = aten::select(%8, %5, %3) # test.py:46:40
+        %15 : Tensor = aten::slice(%10, %5, %6, %6, %5) # test.py:46:40
+        %20 : Tensor = aten::slice(%15, %44, %6, %6, %5) # test.py:46:40
+        %22 : Tensor = aten::clamp(%20, %3, %6) # test.py:46:28
+        %28 : Tensor = aten::slice(%boxes.1, %3, %6, %6, %5) # test.py:46:8
+        %30 : Tensor = aten::select(%28, %5, %3) # test.py:46:8
+        %35 : Tensor = aten::slice(%30, %5, %6, %6, %5) # test.py:46:8
+        %40 : Tensor = aten::slice(%35, %44, %6, %6, %5) # test.py:46:8
+        %42 : Tensor = aten::copy_(%40, %22, %41) # test.py:46:8
+        %45 : Tensor = aten::mul(%boxes.1, %44) # test.py:47:15
+        return (%45))IR";
+    return graph;
+}
+
+/**
+ * this IR is generated from python code below:
+class ClipBoxes(torch.nn.Module):
+    def __init__(self):
+        super(ClipBoxes, self).__init__()
+
+    def forward(self, boxes):
+        boxes[:, 0, :, 1] = torch.clamp(boxes[:, 0, :, 1], min=0)
+        return boxes * 2
+ * **/
+static std::string gen_select_with_tensor_value3() {
+    std::string graph = R"IR(
+      graph(%boxes.1 : Tensor):
+        %36 : bool = prim::Constant[value=0]()
+        %7 : NoneType = prim::Constant()
+        %3 : int = prim::Constant[value=0]() # test.py:50:49
+        %4 : int = prim::Constant[value=1]() # test.py:50:55
+        %39 : int = prim::Constant[value=2]() # test.py:51:23
+        %9 : Tensor = aten::slice(%boxes.1, %3, %7, %7, %4) # test.py:50:40
+        %11 : Tensor = aten::select(%9, %4, %3) # test.py:50:40
+        %16 : Tensor = aten::slice(%11, %4, %7, %7, %4) # test.py:50:40
+        %18 : Tensor = aten::select(%16, %39, %4) # test.py:50:40
+        %20 : Tensor = aten::clamp(%18, %3, %7) # test.py:50:28
+        %26 : Tensor = aten::slice(%boxes.1, %3, %7, %7, %4) # test.py:50:8
+        %28 : Tensor = aten::select(%26, %4, %3) # test.py:50:8
+        %33 : Tensor = aten::slice(%28, %4, %7, %7, %4) # test.py:50:8
+        %35 : Tensor = aten::select(%33, %39, %4) # test.py:50:8
+        %37 : Tensor = aten::copy_(%35, %20, %36) # test.py:50:8
+        %40 : Tensor = aten::mul(%boxes.1, %39) # test.py:51:15
+        return (%40))IR";
+    return graph;
+}
+
+TEST(Fusers, ATenFuseCopySliceTest) {
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseCopy>();
+    // situation1: out[:, :-1, :fold] = x[:, 1:, :fold]
+    const auto slice_graph_simple = gen_simple_slice_graph();
+    fuse_test_helper(slice_graph_simple, fuser, {16, 64, 16, 16}, false);
+    // situation2: multi-copy
+    const auto slice_graph = gen_complex_slice_graph();
+    fuse_test_helper(slice_graph, fuser, {16, 64, 16, 16}, false);
+}
+
+TEST(Fusers, ATenFuseCopySelectWithSingleValueTest) {
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseCopy>();
+    // situation1: attention_mask[2:3:1, 2, :, 0, :] = 1
+    const auto select_graph_IR = gen_select_graph_with_single_value();
+    fuse_test_helper(select_graph_IR, fuser, {4, 4, 5, 4, 3}, true);
+
+    // situation2: attention_mask[0, 2, 1:4:1, 0, :] = 1
+    const auto select_graph_IR2 = gen_select_graph_with_single_value2();
+    fuse_test_helper(select_graph_IR2, fuser, {4, 4, 5, 4, 3}, true);
+}
+
+TEST(Fusers, ATenFuseCopySelectWithTensorValueTest) {
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseCopy>();
+    // situation1: boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
+    const auto select_graph_IR = gen_select_with_tensor_value();
+    fuse_test_helper(select_graph_IR, fuser, {1, 20, 4}, false);
+
+    // situation2: boxes[:, 0, :, :] = torch.clamp(boxes[:, 0, :, :], min=0)
+    const auto select_graph_IR2 = gen_select_with_tensor_value2();
+    fuse_test_helper(select_graph_IR2, fuser, {1, 20, 4, 5}, false);
+
+    // situation3: boxes[:, 0, :, 1] = torch.clamp(boxes[:, 0, :, 1], min=0)
+    const auto select_graph_IR3 = gen_select_with_tensor_value3();
+    fuse_test_helper(select_graph_IR3, fuser, {1, 20, 4, 5}, false);
+}
\ No newline at end of file
diff --git a/poros/unittest/op_fuser/fuse_hard_swish_test.cpp b/poros/unittest/op_fuser/fuse_hard_swish_test.cpp
new file mode 100644
index 0000000000..0719a033f6
--- /dev/null
+++ b/poros/unittest/op_fuser/fuse_hard_swish_test.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_hard_swish_test.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-07 15:31:03
+* @brief
+**/
+
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/lowering/fuse_hard_swish.h"
+#include "poros/lowering/op_fuse_pass.h"
+#include "poros/util/graph_test_helper.h"
+
+static void fuse_test_helper(const std::string &graph_IR,
+                             std::shared_ptr<baidu::mirana::poros::IFuser> fuser,
+                             std::vector<int64_t> input_shape
+) {
+    std::vector<c10::IValue> input_data;
+    input_data.push_back(at::randn(input_shape, {at::kCPU}));
+
+    const std::vector<baidu::mirana::poros::graphtester::InputTypeEnum> input_data_type_mask = {
+            baidu::mirana::poros::graphtester::InputTensor,
+    };
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run both the original graph and the fused engine, then collect their outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> fused_output;
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::run_graph_and_fused_graph(graph_IR, poros_option, fuser,
+                input_data, input_data_type_mask,
+                graph_output, fused_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, fused_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::almost_equal(graph_output[0], fused_output[0], 1e-6));
+}
+
+static std::string gen_hardswish_graph() {
+
+    std::string hardswish = R"IR(
+      graph(%x):
+        %out: Tensor = aten::hardswish(%x)
+        return (%out))IR";
+    return hardswish;
+}
+
+TEST(Fusers, ATenFuseHardSwish_Test) {
+    const auto graph_IR = gen_hardswish_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseHardSwish>();
+
+    fuse_test_helper(graph_IR, fuser, {2, 3, 4, 5});
+    fuse_test_helper(graph_IR, fuser, {3, 4, 5});
+    fuse_test_helper(graph_IR, fuser, {4, 5});
+    fuse_test_helper(graph_IR, fuser, {5});
+}
+
diff --git a/poros/unittest/op_fuser/fuse_meshgrid_test.cpp b/poros/unittest/op_fuser/fuse_meshgrid_test.cpp
new file mode 100644
index 0000000000..0ebc44538d
--- /dev/null
+++ b/poros/unittest/op_fuser/fuse_meshgrid_test.cpp
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 Baidu, Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+* @file fuse_meshgrid_test.cpp
+* @author Lin Xiao Chun (linxiaochun@baidu.com)
+* @date 2022-04-29 14:56:38
+* @brief
+**/
+
+#include <gtest/gtest.h>
+#include <torch/script.h>
+
+#include "poros/lowering/fuse_meshgrid.h"
+#include "poros/lowering/op_fuse_pass.h"
+#include "poros/util/graph_test_helper.h"
+
+static void fuse_test_helper(const std::string &graph_IR,
+                             std::shared_ptr<baidu::mirana::poros::IFuser> fuser,
+                             std::pair<int64_t, int64_t> input_shape
+) {
+    std::vector<c10::IValue> input_data;
+    input_data.push_back(at::randn({input_shape.first}, {at::kCPU}));
+    input_data.push_back(at::randn({input_shape.second}, {at::kCPU}));
+    const std::vector<baidu::mirana::poros::graphtester::InputTypeEnum> input_data_type_mask = {
+            baidu::mirana::poros::graphtester::InputTensor,
+            baidu::mirana::poros::graphtester::InputTensor,
+    };
+
+    baidu::mirana::poros::PorosOptions poros_option; // default device GPU
+    // run both the original graph and the fused engine, then collect their outputs
+    std::vector<at::Tensor> graph_output;
+    std::vector<at::Tensor> fused_output;
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::run_graph_and_fused_graph(graph_IR, poros_option, fuser,
+                input_data, input_data_type_mask,
+                graph_output, fused_output));
+
+    ASSERT_EQ(1, graph_output.size());
+    ASSERT_EQ(1, fused_output.size());
+    ASSERT_TRUE(baidu::mirana::poros::graphtester::almost_equal(graph_output[0], fused_output[0], 1e-6));
+}
+
+static std::string gen_meshgrid_graph() {
+
+    std::string graph = R"IR(
+      graph(%x.1 : Tensor,
+            %y.1 : Tensor):
+        %10 : int = prim::Constant[value=1]()
+        %4 : Tensor[] = prim::ListConstruct(%x.1, %y.1)
+        %5 : Tensor[] = aten::meshgrid(%4)
+        %grid_x.1 : Tensor, %grid_y.1 : Tensor = prim::ListUnpack(%5)
+        %11 : Tensor = aten::add(%grid_x.1, %grid_y.1, %10)
+        return (%11))IR";
+    return graph;
+}
+
+TEST(Fusers, ATenFuseMeshgrid_Test) {
+    const auto graph_IR = gen_meshgrid_graph();
+    auto fuser = std::make_shared<baidu::mirana::poros::FuseMeshgrid>();
+
+    fuse_test_helper(graph_IR, fuser, {2, 3});
+    fuse_test_helper(graph_IR, fuser, {100, 200});
+    fuse_test_helper(graph_IR, fuser, {1000, 1000});
+    fuse_test_helper(graph_IR, fuser, {1, 2});
+}
+
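A note on the R"IR(...)" literals used throughout these tests: each one is a TorchScript graph dump of the Python snippet quoted in the doc comment above it. A minimal sketch of how such a dump can be regenerated, assuming a recent PyTorch (shift_like below is a hypothetical stand-in, not a function from this diff):

    import torch

    def shift_like(x: torch.Tensor) -> torch.Tensor:
        # toy stand-in for the shift() snippets quoted in fuse_copy_test.cpp:
        # a sliced in-place assignment, which scripts to aten::slice + aten::copy_
        out = torch.zeros_like(x)
        out[:, :-1] = x[:, 1:]
        return out

    # torch.jit.script compiles the function to TorchScript; printing .graph
    # emits the textual IR in the same form the tests embed in their literals.
    scripted = torch.jit.script(shift_like)
    print(scripted.graph)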