diff --git a/.github/workflows/benchmarks-nightly.yml b/.github/workflows/benchmarks-nightly.yml new file mode 100644 index 0000000000..3da0d09c7a --- /dev/null +++ b/.github/workflows/benchmarks-nightly.yml @@ -0,0 +1,38 @@ +name: Compute Benchmarks Nightly + +on: + schedule: + - cron: '0 0 * * *' # Runs at midnight UTC every day + +permissions: + contents: read + pull-requests: write + +jobs: + nightly: + name: Compute Benchmarks Nightly level-zero + uses: ./.github/workflows/benchmarks-reusable.yml + with: + str_name: 'level_zero' + unit: 'gpu' + pr_no: 0 + bench_script_params: '--save baseline' + sycl_config_params: '' + sycl_repo: 'intel/llvm' + sycl_commit: '' + + nightly2: + # we need to wait until previous job is done so that the html report + # contains both runs + needs: nightly + name: Compute Benchmarks Nightly level-zero v2 + uses: ./.github/workflows/benchmarks-reusable.yml + with: + str_name: 'level_zero_v2' + unit: 'gpu' + pr_no: 0 + bench_script_params: '--save baseline-v2' + sycl_config_params: '' + sycl_repo: 'intel/llvm' + sycl_commit: '' + upload_report: true diff --git a/.github/workflows/benchmarks_compute.yml b/.github/workflows/benchmarks-reusable.yml similarity index 79% rename from .github/workflows/benchmarks_compute.yml rename to .github/workflows/benchmarks-reusable.yml index ee74a52ad0..dafa754cbd 100644 --- a/.github/workflows/benchmarks_compute.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -1,50 +1,39 @@ -name: Compute Benchmarks +name: Benchmarks Reusable on: - # Can be triggered via manual "dispatch" (from workflow view in GitHub Actions tab) - workflow_dispatch: - # acceptable input for adapter-specific runs + workflow_call: inputs: str_name: - description: Formatted adapter name - type: choice required: true - default: 'level_zero' - options: - - level_zero - - level_zero_v2 + type: string unit: - description: Test unit (cpu/gpu) - type: choice required: true - default: 'gpu' - options: - - cpu - - gpu + type: string pr_no: - description: PR number (if 0, it'll run on the main) - type: number required: true - bench_script_params: - description: Parameters passed to script executing benchmark + # even though this is a number, this is a workaround for issues with + # reusable workflow calls that result in "Unexpected value '0'" error. type: string + bench_script_params: required: false + type: string default: '' sycl_config_params: - description: Extra params for SYCL configuration - type: string required: false + type: string default: '' sycl_repo: - description: 'Compiler repo' - type: string required: true + type: string default: 'intel/llvm' sycl_commit: - description: 'Compiler commit' - type: string required: false + type: string default: '' + upload_report: + required: false + type: boolean + default: false permissions: contents: read @@ -56,19 +45,17 @@ jobs: strategy: matrix: adapter: [ - {str_name: "${{inputs.str_name}}", - sycl_config: "${{inputs.sycl_config_params}}", - unit: "${{inputs.unit}}" + {str_name: "${{ inputs.str_name }}", + sycl_config: "${{ inputs.sycl_config_params }}", + unit: "${{ inputs.unit }}" } ] build_type: [Release] compiler: [{c: clang, cxx: clang++}] - runs-on: "${{inputs.str_name}}_PERF" + runs-on: "${{ inputs.str_name }}_PERF" steps: - # Workspace on self-hosted runners is not cleaned automatically. - # We have to delete the files created outside of using actions. 
- name: Cleanup self-hosted workspace if: always() run: | @@ -99,7 +86,8 @@ jobs: path: ur-repo - name: Install pip packages - run: pip install -r ${{github.workspace}}/ur-repo/third_party/requirements.txt + run: | + pip install --force-reinstall -r ${{github.workspace}}/ur-repo/third_party/benchmark_requirements.txt # We need to fetch special ref for proper PR's merge commit. Note, this ref may be absent if the PR is already merged. - name: Fetch PR's merge commit @@ -169,13 +157,15 @@ jobs: run: cmake --install ${{github.workspace}}/ur_build - name: Run benchmarks + working-directory: ${{ github.workspace }}/ur-repo/ id: benchmarks run: > - numactl -N 0 ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py - ~/bench_workdir - ${{github.workspace}}/sycl_build - ${{github.workspace}}/ur_install - ${{ matrix.adapter.str_name }} + ${{ github.workspace }}/ur-repo/scripts/benchmarks/main.py + ~/bench_workdir + --sycl ${{ github.workspace }}/sycl_build + --ur ${{ github.workspace }}/ur_install + --adapter ${{ matrix.adapter.str_name }} + ${{ inputs.upload_report && '--output-html' || '' }} ${{ inputs.bench_script_params }} - name: Add comment to PR @@ -204,3 +194,10 @@ jobs: repo: context.repo.repo, body: body }) + + - name: Upload HTML report + if: ${{ always() && inputs.upload_report }} + uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + with: + path: ${{ github.workspace }}/ur-repo/benchmark_results.html + key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }} diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 0000000000..af62d40e85 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,68 @@ +name: Compute Benchmarks + +on: + workflow_dispatch: + inputs: + str_name: + description: Formatted adapter name + type: choice + required: true + default: 'level_zero' + options: + - level_zero + - level_zero_v2 + unit: + description: Test unit (cpu/gpu) + type: choice + required: true + default: 'gpu' + options: + - cpu + - gpu + pr_no: + description: PR number (if 0, it'll run on the main) + type: number + required: true + bench_script_params: + description: Parameters passed to script executing benchmark + type: string + required: false + default: '' + sycl_config_params: + description: Extra params for SYCL configuration + type: string + required: false + default: '' + sycl_repo: + description: 'Compiler repo' + type: string + required: true + default: 'intel/llvm' + sycl_commit: + description: 'Compiler commit' + type: string + required: false + default: '' + upload_report: + description: 'Upload HTML report' + type: boolean + required: false + default: false + +permissions: + contents: read + pull-requests: write + +jobs: + manual: + name: Compute Benchmarks + uses: ./.github/workflows/benchmarks-reusable.yml + with: + str_name: ${{ inputs.str_name }} + unit: ${{ inputs.unit }} + pr_no: ${{ inputs.pr_no }} + bench_script_params: ${{ inputs.bench_script_params }} + sycl_config_params: ${{ inputs.sycl_config_params }} + sycl_repo: ${{ inputs.sycl_repo }} + sycl_commit: ${{ inputs.sycl_commit }} + upload_report: ${{ inputs.upload_report }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 710aa659c8..bdd4cf1c52 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -45,7 +45,23 @@ jobs: - name: Build Documentation working-directory: ${{github.workspace}}/scripts - run: python3 run.py --core + run: | + python3 run.py --core + mkdir -p ${{ 
github.workspace }}/ur-repo/ + mkdir -p ${{github.workspace}}/docs/html + + - name: Download benchmark HTML + id: download-bench-html + uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + with: + path: ${{ github.workspace }}/ur-repo/benchmark_results.html + key: benchmark-results- + + - name: Move benchmark HTML + # exact or partial cache hit + if: steps.download-bench-html.outputs.cache-hit != '' + run: | + mv ${{ github.workspace }}/ur-repo/benchmark_results.html ${{ github.workspace }}/docs/html/ - name: Upload artifact uses: actions/upload-pages-artifact@0252fc4ba7626f0298f0cf00902a25c6afc77fa8 # v3.0.0 diff --git a/README.md b/README.md index 262a861b9d..dc70f43876 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/unified-runtime/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/unified-runtime) [![Trivy](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/trivy.yml) [![Deploy documentation to Pages](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/docs.yml) +[![Compute Benchmarks Nightly](https://github.com/oneapi-src/unified-runtime/actions/workflows/benchmarks-nightly.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/benchmarks-nightly.yml) diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 64a7a3eeb9..bd6de60a0a 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -37,9 +37,10 @@ By default, the benchmark results are not stored. To store them, use the option To compare a benchmark run with a previously stored result, use the option `--compare `. You can compare with more than one result. -If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`. This baseline is **not** automatically updated. To update it, use the `--save baseline` option. -The recommended way of updating the baseline is running the benchmarking -job on main after a merge of relevant changes. +If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`. + +Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results +are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html). ## Requirements diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py deleted file mode 100644 index b9e7619e47..0000000000 --- a/scripts/benchmarks/benches/SobelFilter.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -import re - -class SobelFilter(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("sobel_filter", "sobel_filter", vb) - - def download_deps(self): - self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz") - return - - def name(self): - return "Velocity-Bench Sobel Filter" - - def unit(self): - return "ms" - - def bin_args(self) -> list[str]: - return ["-i", f"{self.data_path}/sobel_filter_data/silverfalls_32Kx32K.png", - "-n", "5"] - - def extra_env_vars(self) -> dict: - return {"OPENCV_IO_MAX_IMAGE_PIXELS" : "1677721600"} - - def parse_output(self, stdout: str) -> float: - match = re.search(r'sobelfilter - total time for whole calculation: (\d+\.\d+) s', stdout) - if match: - return round(float(match.group(1)) * 1000, 3) - else: - raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") - diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index 36f252cb42..3871938bfd 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -20,7 +20,7 @@ def __init__(self, directory): def get_adapter_full_path(): for libs_dir_name in ['lib', 'lib64']: adapter_path = os.path.join( - options.ur_dir, libs_dir_name, f"libur_adapter_{options.ur_adapter_name}.so") + options.ur, libs_dir_name, f"libur_adapter_{options.ur_adapter}.so") if os.path.isfile(adapter_path): return adapter_path assert False, \ @@ -28,8 +28,10 @@ def get_adapter_full_path(): def run_bench(self, command, env_vars): env_vars_with_forced_adapter = env_vars.copy() - env_vars_with_forced_adapter.update( - {'UR_ADAPTERS_FORCE_LOAD': Benchmark.get_adapter_full_path()}) + if options.ur is not None: + env_vars_with_forced_adapter.update( + {'UR_ADAPTERS_FORCE_LOAD': Benchmark.get_adapter_full_path()}) + return run( command=command, env_vars=env_vars_with_forced_adapter, @@ -76,3 +78,10 @@ def run(self, env_vars) -> list[Result]: def teardown(self): raise NotImplementedError() + +class Suite: + def benchmarks(self) -> list[Benchmark]: + raise NotImplementedError() + + def setup(self): + return diff --git a/scripts/benchmarks/benches/bitcracker.py b/scripts/benchmarks/benches/bitcracker.py deleted file mode 100644 index bb198433fa..0000000000 --- a/scripts/benchmarks/benches/bitcracker.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -import os -import re - -class Bitcracker(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("bitcracker", "bitcracker", vb) - self.data_path = os.path.join(vb.repo_path, "bitcracker", "hash_pass") - - def name(self): - return "Velocity-Bench Bitcracker" - - def unit(self): - return "s" - - def bin_args(self) -> list[str]: - return ["-f", f"{self.data_path}/img_win8_user_hash.txt", - "-d", f"{self.data_path}/user_passwords_60000.txt", - "-b", "60000"] - - def parse_output(self, stdout: str) -> float: - match = re.search(r'bitcracker - total time for whole calculation: (\d+\.\d+) s', stdout) - if match: - return float(match.group(1)) - else: - raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 9c84739f75..e08109f77e 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -7,17 +7,16 @@ import csv import io from utils.utils import run, git_clone, create_build_path -from .base import Benchmark +from .base import Benchmark, Suite from .result import Result from .options import options -class ComputeBench: +class ComputeBench(Suite): def __init__(self, directory): self.directory = directory - self.built = False def setup(self): - if self.built: + if options.sycl is None: return repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "aa6a3b2108bb86202b654ad28129156fa746d41d") @@ -31,10 +30,14 @@ def setup(self): f"-DBUILD_SYCL=ON", f"-DSYCL_COMPILER_ROOT={options.sycl}", f"-DALLOW_WARNINGS=ON", - f"-DBUILD_UR=ON", - f"-Dunified-runtime_DIR={options.ur_dir}/lib/cmake/unified-runtime", ] + if options.ur is not None: + configure_command += [ + f"-DBUILD_UR=ON", + f"-Dunified-runtime_DIR={options.ur}/lib/cmake/unified-runtime", + ] + print(f"{self.__class__.__name__}: Run {configure_command}") run(configure_command, add_sycl=True) print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j") @@ -42,6 +45,36 @@ def setup(self): self.built = True + def benchmarks(self) -> list[Benchmark]: + if options.sycl is None: + return [] + + benches = [ + SubmitKernelSYCL(self, 0), + SubmitKernelSYCL(self, 1), + QueueInOrderMemcpy(self, 0, 'Device', 'Device', 1024), + QueueInOrderMemcpy(self, 0, 'Host', 'Device', 1024), + QueueMemcpy(self, 'Device', 'Device', 1024), + StreamMemory(self, 'Triad', 10 * 1024, 'Device'), + ExecImmediateCopyQueue(self, 0, 1, 'Device', 'Device', 1024), + ExecImmediateCopyQueue(self, 1, 1, 'Device', 'Host', 1024), + VectorSum(self), + MemcpyExecute(self, 400, 8, 1024, 100), + MemcpyExecute(self, 400, 8, 102400, 10), + MemcpyExecute(self, 500, 8, 102400, 10), + MemcpyExecute(self, 400, 1, 1024, 1000), + MemcpyExecute(self, 10, 16, 1024, 1000), + MemcpyExecute(self, 10, 16, 102400, 100), + ] + + if options.ur is not None: + benches += [ + SubmitKernelUR(self, 0), + SubmitKernelUR(self, 1), + ] + + return benches + class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): self.bench = bench @@ -60,7 +93,6 @@ def unit(self): def setup(self): self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) - self.bench.setup() def run(self, env_vars) -> 
list[Result]: command = [ @@ -75,7 +107,7 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) (label, mean) = self.parse_output(result) - return [ Result(label=self.name(), value=mean, command=command, env=env_vars, stdout=result, lower_is_better=self.lower_is_better()) ] + return [ Result(label=self.name(), value=mean, command=command, env=env_vars, stdout=result) ] def parse_output(self, output): csv_file = io.StringIO(output) diff --git a/scripts/benchmarks/benches/cudaSift.py b/scripts/benchmarks/benches/cudaSift.py deleted file mode 100644 index 482d258052..0000000000 --- a/scripts/benchmarks/benches/cudaSift.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -import os -import re -import shutil - -class CudaSift(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("cudaSift", "cudaSift", vb) - - def download_deps(self): - images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') - dest = os.path.join(self.directory, 'inputData') - if not os.path.exists(dest): - shutil.copytree(images, dest) - - def name(self): - return "Velocity-Bench CudaSift" - - def unit(self): - return "ms" - - def parse_output(self, stdout: str) -> float: - match = re.search(r'Avg workload time = (\d+\.\d+) ms', stdout) - if match: - return float(match.group(1)) - else: - raise ValueError("Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/easywave.py b/scripts/benchmarks/benches/easywave.py deleted file mode 100644 index 2f89482329..0000000000 --- a/scripts/benchmarks/benches/easywave.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -from .options import options -import re -import os - -class Easywave(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("easywave", "easyWave_sycl", vb) - - def download_deps(self): - self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz") - - def name(self): - return "Velocity-Bench Easywave" - - def unit(self): - return "ms" - - def bin_args(self) -> list[str]: - return ["-grid", f"{self.data_path}/examples/e2Asean.grd", - "-source", f"{self.data_path}/examples/BengkuluSept2007.flt", - "-time", "120"] - - # easywave doesn't output a useful single perf value. Instead, we parse the - # output logs looking for the very last line containing the elapsed time of the - # application. 
- def get_last_elapsed_time(self, log_file_path) -> float: - elapsed_time_pattern = re.compile(r'Model time = (\d{2}:\d{2}:\d{2}),\s+elapsed: (\d+) msec') - last_elapsed_time = None - - try: - with open(log_file_path, 'r') as file: - for line in file: - match = elapsed_time_pattern.search(line) - if match: - last_elapsed_time = int(match.group(2)) - - if last_elapsed_time is not None: - return last_elapsed_time - else: - raise ValueError("No elapsed time found in the log file.") - except FileNotFoundError: - raise FileNotFoundError(f"The file {log_file_path} does not exist.") - except Exception as e: - raise e - - def parse_output(self, stdout: str) -> float: - return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log")) diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py deleted file mode 100644 index c5ed397dbb..0000000000 --- a/scripts/benchmarks/benches/hashtable.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -import os -import re - -class Hashtable(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("hashtable", "hashtable_sycl", vb) - - def name(self): - return "Velocity-Bench Hashtable" - - def unit(self): - return "M keys/sec" - - def bin_args(self) -> list[str]: - return ["--no-verify"] - - def lower_is_better(self): - return False - - def parse_output(self, stdout: str) -> float: - match = re.search(r'(\d+\.\d+) million keys/second', stdout) - if match: - return float(match.group(1)) - else: - raise ValueError("{self.__class__.__name__}: Failed to parse keys per second from benchmark output.") diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index c035ce6800..5997cdedb8 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -1,13 +1,26 @@ from dataclasses import dataclass +from enum import Enum + +class Compare(Enum): + LATEST = 'latest' + AVERAGE = 'average' + MEDIAN = 'median' @dataclass class Options: - sycl: str = "" + sycl: str = None + ur: str = None + ur_adapter: str = None rebuild: bool = True benchmark_cwd: str = "INVALID" timeout: float = 600 iterations: int = 5 verbose: bool = False + compare: Compare = Compare.LATEST + compare_max: int = 10 # average/median over how many results + output_html: bool = False + output_markdown: bool = True + dry_run: bool = False options = Options() diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py deleted file mode 100644 index b7600d11be..0000000000 --- a/scripts/benchmarks/benches/quicksilver.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from .base import Benchmark -from .result import Result -from .velocity import VelocityBase, VelocityBench -from utils.utils import run -import os -import re - -class QuickSilver(VelocityBase): - def __init__(self, vb: VelocityBench): - super().__init__("QuickSilver", "qs", vb) - self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering") - - def run(self, env_vars) -> list[Result]: - # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 - if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0': - return None - - return super().run(env_vars) - - def name(self): - return "Velocity-Bench QuickSilver" - - def unit(self): - return "MMS/CTT" - - def lower_is_better(self): - return False - - def bin_args(self) -> list[str]: - return ["-i", f"{self.data_path}/scatteringOnly.inp"] - - def extra_env_vars(self) -> dict: - return {"QS_DEVICE" : "GPU"} - - def parse_output(self, stdout: str) -> float: - match = re.search(r'Figure Of Merit\s+(\d+\.\d+)', stdout) - if match: - return float(match.group(1)) - else: - raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 6fc7e16095..07ee70148a 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -4,7 +4,9 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception from dataclasses import dataclass +from typing import Optional from dataclasses_json import dataclass_json +from datetime import datetime @dataclass_json @dataclass @@ -15,6 +17,17 @@ class Result: env: str stdout: str passed: bool = True + # values should not be set by the benchmark unit: str = "" name: str = "" lower_is_better: bool = True + git_hash: str = '' + date: Optional[datetime] = None + +@dataclass_json +@dataclass +class BenchmarkRun: + results: list[Result] + name: str = 'This PR' + git_hash: str = '' + date: datetime = None diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py index b9d6e50623..fbfd009935 100644 --- a/scripts/benchmarks/benches/syclbench.py +++ b/scripts/benchmarks/benches/syclbench.py @@ -7,19 +7,20 @@ import csv import io from utils.utils import run, git_clone, create_build_path -from .base import Benchmark +from .base import Benchmark, Suite from .result import Result from .options import options -class SyclBench: +class SyclBench(Suite): def __init__(self, directory): + if options.sycl is None: + return + self.directory = directory - self.built = False - self.setup() return def setup(self): - if self.built: + if options.sycl is None: return build_path = create_build_path(self.directory, 'sycl-bench-build') @@ -40,6 +41,50 @@ def setup(self): self.built = True + def benchmarks(self) -> list[Benchmark]: + if options.sycl is None: + return [] + + return [ + # Blocked_transform(self), # run time < 1ms + DagTaskI(self), + DagTaskS(self), + HostDevBandwidth(self), + LocalMem(self), + Pattern_L2(self), + Reduction(self), + ScalarProd(self), + SegmentReduction(self), + UsmAccLatency(self), + UsmAllocLatency(self), + UsmInstrMix(self), + UsmPinnedOverhead(self), + VecAdd(self), + + # *** sycl-bench single benchmarks + # TwoDConvolution(self), # run time < 1ms + Two_mm(self), + Three_mm(self), + # Arith(self), # run time < 1ms + Atax(self), + # Atomic_reduction(self), # run time < 1ms + 
Bicg(self), + Correlation(self), + Covariance(self), + Gemm(self), + Gesumv(self), + Gramschmidt(self), + KMeans(self), + LinRegCoeff(self), + # LinRegError(self), # run time < 1ms + MatmulChain(self), + MolDyn(self), + Mvt(self), + Sf(self), + Syr2k(self), + Syrk(self), + ] + class SyclBenchmark(Benchmark): def __init__(self, bench, name, test): self.bench = bench @@ -58,7 +103,6 @@ def unit(self): return "ms" def setup(self): - self.bench.setup() self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name) def run(self, env_vars) -> list[Result]: diff --git a/scripts/benchmarks/benches/test.py b/scripts/benchmarks/benches/test.py new file mode 100644 index 0000000000..88bc29a649 --- /dev/null +++ b/scripts/benchmarks/benches/test.py @@ -0,0 +1,68 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import random +from utils.utils import git_clone +from .base import Benchmark, Suite +from .result import Result +from utils.utils import run, create_build_path +from .options import options +import os + +class TestSuite(Suite): + def __init__(self): + return + + def setup(self): + return + + def benchmarks(self) -> list[Benchmark]: + bench_configs = [ + ("Memory Bandwidth", 2000, 200), + ("Latency", 100, 20), + ("Throughput", 1500, 150), + ("FLOPS", 3000, 300), + ("Cache Miss Rate", 250, 25), + ] + + result = [] + for base_name, base_value, base_diff in bench_configs: + for variant in range(6): + value_multiplier = 1.0 + (variant * 0.2) + name = f"{base_name} {variant+1}" + value = base_value * value_multiplier + diff = base_diff * value_multiplier + + result.append(TestBench(name, value, diff)) + + return result + +class TestBench(Benchmark): + def __init__(self, name, value, diff): + self.bname = name + self.value = value + self.diff = diff + super().__init__("") + + def name(self): + return self.bname + + def unit(self): + return "ms" + + def lower_is_better(self): + return True + + def setup(self): + return + + def run(self, env_vars) -> list[Result]: + random_value = self.value + random.uniform(-1 * (self.diff), self.diff) + return [ + Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output") + ] + + def teardown(self): + return diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 3c903bf11b..38efa42f56 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -3,18 +3,41 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import re +import shutil from utils.utils import git_clone -from .base import Benchmark +from .base import Benchmark, Suite from .result import Result from utils.utils import run, create_build_path from .options import options import os -class VelocityBench: +class VelocityBench(Suite): def __init__(self, directory): + if options.sycl is None: + return + self.directory = directory + + def setup(self): + if options.sycl is None: + return + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench/", "b22215c16f789100449c34bf4eaa3fb178983d69") + def benchmarks(self) -> list[Benchmark]: + if options.sycl is None: + return [] + + return [ + Hashtable(self), + Bitcracker(self), + CudaSift(self), + Easywave(self), + QuickSilver(self), + SobelFilter(self) + ] + class 
VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): super().__init__(vb.directory) @@ -60,7 +83,185 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) - return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, lower_is_better=self.lower_is_better()) ] + return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result) ] def teardown(self): return + +class Hashtable(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("hashtable", "hashtable_sycl", vb) + + def name(self): + return "Velocity-Bench Hashtable" + + def unit(self): + return "M keys/sec" + + def bin_args(self) -> list[str]: + return ["--no-verify"] + + def lower_is_better(self): + return False + + def parse_output(self, stdout: str) -> float: + match = re.search(r'(\d+\.\d+) million keys/second', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("{self.__class__.__name__}: Failed to parse keys per second from benchmark output.") + + +class Bitcracker(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("bitcracker", "bitcracker", vb) + self.data_path = os.path.join(vb.repo_path, "bitcracker", "hash_pass") + + def name(self): + return "Velocity-Bench Bitcracker" + + def unit(self): + return "s" + + def bin_args(self) -> list[str]: + return ["-f", f"{self.data_path}/img_win8_user_hash.txt", + "-d", f"{self.data_path}/user_passwords_60000.txt", + "-b", "60000"] + + def parse_output(self, stdout: str) -> float: + match = re.search(r'bitcracker - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") + +class SobelFilter(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("sobel_filter", "sobel_filter", vb) + + def download_deps(self): + self.download_untar("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz") + return + + def name(self): + return "Velocity-Bench Sobel Filter" + + def unit(self): + return "ms" + + def bin_args(self) -> list[str]: + return ["-i", f"{self.data_path}/sobel_filter_data/silverfalls_32Kx32K.png", + "-n", "5"] + + def extra_env_vars(self) -> dict: + return {"OPENCV_IO_MAX_IMAGE_PIXELS" : "1677721600"} + + def parse_output(self, stdout: str) -> float: + match = re.search(r'sobelfilter - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return round(float(match.group(1)) * 1000, 3) + else: + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") + + +class QuickSilver(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("QuickSilver", "qs", vb) + self.data_path = os.path.join(vb.repo_path, "QuickSilver", "Examples", "AllScattering") + + def run(self, env_vars) -> list[Result]: + # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 + if 'UR_L0_USE_IMMEDIATE_COMMANDLISTS' in env_vars and env_vars['UR_L0_USE_IMMEDIATE_COMMANDLISTS'] == '0': + return None + + return super().run(env_vars) + + def name(self): + return "Velocity-Bench QuickSilver" + + def unit(self): + return "MMS/CTT" + + def lower_is_better(self): + return False + + def bin_args(self) -> list[str]: + return ["-i", f"{self.data_path}/scatteringOnly.inp"] + + def 
extra_env_vars(self) -> dict: + return {"QS_DEVICE" : "GPU"} + + def parse_output(self, stdout: str) -> float: + match = re.search(r'Figure Of Merit\s+(\d+\.\d+)', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") + +class Easywave(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("easywave", "easyWave_sycl", vb) + + def download_deps(self): + self.download_untar("easywave", "https://git.gfz-potsdam.de/id2/geoperil/easyWave/-/raw/master/data/examples.tar.gz", "examples.tar.gz") + + def name(self): + return "Velocity-Bench Easywave" + + def unit(self): + return "ms" + + def bin_args(self) -> list[str]: + return ["-grid", f"{self.data_path}/examples/e2Asean.grd", + "-source", f"{self.data_path}/examples/BengkuluSept2007.flt", + "-time", "120"] + + # easywave doesn't output a useful single perf value. Instead, we parse the + # output logs looking for the very last line containing the elapsed time of the + # application. + def get_last_elapsed_time(self, log_file_path) -> float: + elapsed_time_pattern = re.compile(r'Model time = (\d{2}:\d{2}:\d{2}),\s+elapsed: (\d+) msec') + last_elapsed_time = None + + try: + with open(log_file_path, 'r') as file: + for line in file: + match = elapsed_time_pattern.search(line) + if match: + last_elapsed_time = int(match.group(2)) + + if last_elapsed_time is not None: + return last_elapsed_time + else: + raise ValueError("No elapsed time found in the log file.") + except FileNotFoundError: + raise FileNotFoundError(f"The file {log_file_path} does not exist.") + except Exception as e: + raise e + + def parse_output(self, stdout: str) -> float: + return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log")) + + +class CudaSift(VelocityBase): + def __init__(self, vb: VelocityBench): + super().__init__("cudaSift", "cudaSift", vb) + + def download_deps(self): + images = os.path.join(self.vb.repo_path, self.bench_name, 'inputData') + dest = os.path.join(self.directory, 'inputData') + if not os.path.exists(dest): + shutil.copytree(images, dest) + + def name(self): + return "Velocity-Bench CudaSift" + + def unit(self): + return "ms" + + def parse_output(self, stdout: str) -> float: + match = re.search(r'Avg workload time = (\d+\.\d+) ms', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") diff --git a/scripts/benchmarks/history.py b/scripts/benchmarks/history.py new file mode 100644 index 0000000000..5b83ef9479 --- /dev/null +++ b/scripts/benchmarks/history.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import json +from pathlib import Path +from benches.result import Result, BenchmarkRun +from benches.options import Compare, options +from datetime import datetime, timezone +from utils.utils import run; + +class BenchmarkHistory: + benchmark_run_index_max = 0 + runs = [] + + def __init__(self, dir): + self.dir = dir + + def load_result(self, file_path: Path) -> BenchmarkRun: + if file_path.exists(): + with file_path.open('r') as file: + data = json.load(file) + return BenchmarkRun.from_json(data) + else: + return None + + def load(self, n: int): + results_dir = Path(self.dir) / 'results' + if not results_dir.exists() or not results_dir.is_dir(): + return [] + + # Get all JSON files in the results directory + benchmark_files = list(results_dir.glob('*.json')) + + # Extract index numbers and sort files by index number + def extract_index(file_path: Path) -> int: + try: + return int(file_path.stem.split('_')[0]) + except (IndexError, ValueError): + return -1 + + benchmark_files = [file for file in benchmark_files if extract_index(file) != -1] + benchmark_files.sort(key=extract_index) + + # Load the first n benchmark files + benchmark_runs = [] + for file_path in benchmark_files[n::-1]: + benchmark_run = self.load_result(file_path) + if benchmark_run: + benchmark_runs.append(benchmark_run) + + if benchmark_files: + self.benchmark_run_index_max = extract_index(benchmark_files[-1]) + + self.runs = benchmark_runs + + def create_run(self, name: str, results: list[Result]) -> BenchmarkRun: + try: + result = run("git rev-parse --short HEAD") + git_hash = result.stdout.decode().strip() + except: + git_hash = 'unknown' + + return BenchmarkRun(name = name, git_hash = git_hash, date = datetime.now(tz=timezone.utc), results = results) + + def save(self, save_name, results: list[Result], to_file = True): + benchmark_data = self.create_run(save_name, results) + self.runs.append(benchmark_data) + + if not to_file: + return + + serialized = benchmark_data.to_json() + results_dir = Path(os.path.join(self.dir, 'results')) + os.makedirs(results_dir, exist_ok=True) + + self.benchmark_run_index_max += 1 + file_path = Path(os.path.join(results_dir, f"{self.benchmark_run_index_max}_{save_name}.json")) + with file_path.open('w') as file: + json.dump(serialized, file, indent=4) + print(f"Benchmark results saved to {file_path}") + + def find_first(self, name: str) -> BenchmarkRun: + for r in self.runs: + if r.name == name: + return r + return None + + def compute_average(self, data: list[BenchmarkRun]): + first_run = data[0] + average_results = [] + + for i in range(len(first_run.results)): + all_values = [run.results[i].value for run in data] + + # Calculate the average value for the current result index + average_value = sum(all_values) / len(all_values) + + average_result = first_run.results[i] + average_result.value = average_value + + average_results.append(average_result) + + average_benchmark_run = BenchmarkRun( + results = average_results, + name = first_run.name, + git_hash = "average", + date = first_run.date # should this be different? 
+ ) + + return average_benchmark_run + + def get_compare(self, name: str) -> BenchmarkRun: + if options.compare == Compare.LATEST: + return self.find_first(name) + + data = [] + for r in self.runs: + if r.name == name: + data.append(r) + if len(data) == options.compare_max: + break + + if len(data) == 0: + return None + + if options.compare == Compare.MEDIAN: + return data[len(data) // 2] + + if options.compare == Compare.AVERAGE: + return self.compute_average(data) + + raise Exception("invalid compare type") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 85d9b6b608..0756554e77 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -5,108 +5,53 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from utils.utils import prepare_workdir, load_benchmark_results, save_benchmark_results; from benches.compute import * -from benches.hashtable import Hashtable -from benches.bitcracker import Bitcracker -from benches.cudaSift import CudaSift -from benches.easywave import Easywave -from benches.quicksilver import QuickSilver -from benches.SobelFilter import SobelFilter from benches.velocity import VelocityBench from benches.syclbench import * -from benches.options import options -from output import generate_markdown +from benches.test import TestSuite +from benches.options import Compare, options +from output_markdown import generate_markdown +from output_html import generate_html +from history import BenchmarkHistory +from utils.utils import prepare_workdir; + import argparse import re -import subprocess # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.7' +INTERNAL_WORKDIR_VERSION = '2.0' def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) - cb = ComputeBench(directory) - vb = VelocityBench(directory) - sb = SyclBench(directory) - - benchmarks = [ - # *** Compute benchmarks - SubmitKernelSYCL(cb, 0), - SubmitKernelSYCL(cb, 1), - SubmitKernelUR(cb, 0), - SubmitKernelUR(cb, 1), - QueueInOrderMemcpy(cb, 0, 'Device', 'Device', 1024), - QueueInOrderMemcpy(cb, 0, 'Host', 'Device', 1024), - QueueMemcpy(cb, 'Device', 'Device', 1024), - StreamMemory(cb, 'Triad', 10 * 1024, 'Device'), - ExecImmediateCopyQueue(cb, 0, 1, 'Device', 'Device', 1024), - ExecImmediateCopyQueue(cb, 1, 1, 'Device', 'Host', 1024), - VectorSum(cb), - MemcpyExecute(cb, 400, 8, 1024, 100), - MemcpyExecute(cb, 400, 8, 102400, 10), - MemcpyExecute(cb, 500, 8, 102400, 10), - MemcpyExecute(cb, 400, 1, 1024, 1000), - MemcpyExecute(cb, 10, 16, 1024, 1000), - MemcpyExecute(cb, 10, 16, 102400, 100), - - # *** Velocity benchmarks - Hashtable(vb), - Bitcracker(vb), - CudaSift(vb), - Easywave(vb), - QuickSilver(vb), - SobelFilter(vb), - - # *** sycl-bench multi benchmarks - # Blocked_transform(sb), # run time < 1ms - DagTaskI(sb), - DagTaskS(sb), - HostDevBandwidth(sb), - LocalMem(sb), - Pattern_L2(sb), - Reduction(sb), - ScalarProd(sb), - SegmentReduction(sb), - UsmAccLatency(sb), - UsmAllocLatency(sb), - UsmInstrMix(sb), - UsmPinnedOverhead(sb), - VecAdd(sb), - - # *** sycl-bench single benchmarks - # TwoDConvolution(sb), # run time < 1ms - Two_mm(sb), - Three_mm(sb), - # Arith(sb), # run time < 1ms - Atax(sb), - # Atomic_reduction(sb), # run time < 1ms - Bicg(sb), - Correlation(sb), - Covariance(sb), - Gemm(sb), - Gesumv(sb), - Gramschmidt(sb), - KMeans(sb), - LinRegCoeff(sb), - # LinRegError(sb), # run time < 1ms - MatmulChain(sb), - MolDyn(sb), 
- Mvt(sb), - Sf(sb), - Syr2k(sb), - Syrk(sb), - ] + suites = [ + ComputeBench(directory), + VelocityBench(directory), + SyclBench(directory) + #TestSuite() + ] if not options.dry_run else [] + + benchmarks = [] + + for s in suites: + print(f"Setting up {type(s).__name__}") + s.setup() + print(f"{type(s).__name__} setup complete.") + + for s in suites: + benchmarks += s.benchmarks() if filter: benchmarks = [benchmark for benchmark in benchmarks if filter.search(benchmark.name())] + for b in benchmarks: + print(b.name()) + for benchmark in benchmarks: try: - print(f"setting up {benchmark.name()}... ", end='', flush=True) + print(f"Setting up {benchmark.name()}... ") benchmark.setup() - print("complete.") + print(f"{benchmark.name()} setup complete.") except Exception as e: if options.exit_on_failure: @@ -131,7 +76,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): iteration_results.append(bench_result) else: print(f"did not finish (OK for sycl-bench).") - break; + break if len(iteration_results) == 0: continue @@ -145,6 +90,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): median_result.unit = benchmark.unit() median_result.name = label + median_result.lower_is_better = benchmark.lower_is_better() results.append(median_result) except Exception as e: @@ -158,23 +104,44 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): benchmark.teardown() print("complete.") - chart_data = {"This PR" : results} + this_name = "This PR" + + chart_data = {this_name : results} + + history = BenchmarkHistory(directory) + # limit how many files we load. + # should this be configurable? + history.load(1000) for name in compare_names: print(f"compare name: {name}") - compare_result = load_benchmark_results(directory, name) + compare_result = history.get_compare(name) if compare_result: - chart_data[name] = compare_result + chart_data[name] = compare_result.results + + if options.output_markdown: + markdown_content = generate_markdown(this_name, chart_data) + + with open('benchmark_results.md', 'w') as file: + file.write(markdown_content) + + saved_name = save_name if save_name is not None else this_name - if save_name: - save_benchmark_results(directory, save_name, results) + # It's important we don't save the current results into history before + # we calculate historical averages or get latest results for compare. + # Otherwise we might be comparing the results to themselves. 
+ if not options.dry_run: + history.save(saved_name, results, save_name is not None) + print(f"Markdown with benchmark results has been written to {os.getcwd()}/benchmark_results.md") + compare_names.append(saved_name) - markdown_content = generate_markdown(chart_data) + if options.output_html: + html_content = generate_html(history.runs, 'oneapi-src/unified-runtime', compare_names) - with open('benchmark_results.md', 'w') as file: - file.write(markdown_content) + with open('benchmark_results.html', 'w') as file: + file.write(html_content) - print(f"Markdown with benchmark results has been written to {os.getcwd()}/benchmark_results.md") + print(f"HTML with benchmark results has been written to {os.getcwd()}/benchmark_results.html") def validate_and_parse_env_args(env_args): env_vars = {} @@ -188,9 +155,9 @@ def validate_and_parse_env_args(env_args): if __name__ == "__main__": parser = argparse.ArgumentParser(description='Unified Runtime Benchmark Runner') parser.add_argument('benchmark_directory', type=str, help='Working directory to setup benchmarks.') - parser.add_argument('sycl', type=str, help='Root directory of the SYCL compiler.') - parser.add_argument('ur_dir', type=str, help='UR install prefix path') - parser.add_argument('ur_adapter_name', type=str, help='Options to build the Unified Runtime as part of the benchmark') + parser.add_argument('--sycl', type=str, help='Root directory of the SYCL compiler.', default=None) + parser.add_argument('--ur', type=str, help='UR install prefix path', default=None) + parser.add_argument('--adapter', type=str, help='Options to build the Unified Runtime as part of the benchmark', default="level_zero") parser.add_argument("--no-rebuild", help='Rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') @@ -200,7 +167,12 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None) parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005) parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") - parser.add_argument("--exit_on_failure", help='Exit on first failure.', action="store_true") + parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true") + parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value) + parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10) + parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False) + parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True) + parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -211,9 +183,14 @@ def validate_and_parse_env_args(env_args): options.iterations = args.iterations options.timeout = args.timeout options.epsilon = args.epsilon - options.ur_dir = args.ur_dir - options.ur_adapter_name = args.ur_adapter_name + options.ur = args.ur + options.ur_adapter = 
args.adapter options.exit_on_failure = args.exit_on_failure + options.compare = Compare(args.compare_type) + options.compare_max = args.compare_max + options.output_html = args.output_html + options.output_markdown = args.output_markdown + options.dry_run = args.dry_run benchmark_filter = re.compile(args.filter) if args.filter else None diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py new file mode 100644 index 0000000000..8249bc75c9 --- /dev/null +++ b/scripts/benchmarks/output_html.py @@ -0,0 +1,358 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import matplotlib.pyplot as plt +import mpld3 +from collections import defaultdict +from dataclasses import dataclass +import matplotlib.dates as mdates +import numpy as np +from benches.result import BenchmarkRun, Result + +@dataclass +class BenchmarkMetadata: + unit: str + lower_is_better: bool + +@dataclass +class BenchmarkSeries: + label: str + metadata: BenchmarkMetadata + runs: list[BenchmarkRun] + +@dataclass +class LatestResults: + benchmark_label: str + run_values: dict[str, float] + + @classmethod + def from_dict(cls, label: str, values: dict[str, float]) -> 'LatestResults': + return cls(benchmark_label=label, run_values=values) + +def get_latest_results(benchmarks: list[BenchmarkSeries]) -> dict[str, LatestResults]: + latest_results: dict[str, LatestResults] = {} + for benchmark in benchmarks: + run_values = { + run.name: max(run.results, key=lambda x: x.date).value + for run in benchmark.runs + } + latest_results[benchmark.label] = LatestResults.from_dict(benchmark.label, run_values) + return latest_results + +def prepare_normalized_data(latest_results: dict[str, LatestResults], + benchmarks: list[BenchmarkSeries], + group_benchmarks: list[str], + non_baseline_runs: list[str], + baseline_name: str) -> list[list[float]]: + normalized_data = [] + benchmark_map = {b.label: b for b in benchmarks} + + for run_name in non_baseline_runs: + run_data: list[float] = [] + for benchmark_label in group_benchmarks: + benchmark_data = latest_results[benchmark_label].run_values + if run_name not in benchmark_data or baseline_name not in benchmark_data: + run_data.append(None) + continue + + baseline_value = benchmark_data[baseline_name] + current_value = benchmark_data[run_name] + + normalized_value = ((baseline_value / current_value) if benchmark_map[benchmark_label].metadata.lower_is_better + else (current_value / baseline_value)) * 100 + run_data.append(normalized_value) + normalized_data.append(run_data) + return normalized_data + +def format_benchmark_label(label: str) -> list[str]: + words = label.split() + if len(words) <= 2: + return [label] + + mid = len(words) // 2 + return [' '.join(words[:mid]), ' '.join(words[mid:])] + +def create_bar_plot(ax: plt.Axes, + normalized_data: list[list[float]], + group_benchmarks: list[str], + non_baseline_runs: list[str], + latest_results: dict[str, LatestResults], + benchmarks: list[BenchmarkSeries], + baseline_name: str) -> float: + x = np.arange(len(group_benchmarks)) + width = 0.8 / len(non_baseline_runs) + max_height = 0 + benchmark_map = {b.label: b for b in benchmarks} + + for i, (run_name, run_data) in enumerate(zip(non_baseline_runs, normalized_data)): + offset = width * i - width * (len(non_baseline_runs) - 1) / 2 + positions = x + offset + valid_data = [v if v is not None else 0 for v in run_data] 
+ rects = ax.bar(positions, valid_data, width, label=run_name) + + for rect, value, benchmark_label in zip(rects, run_data, group_benchmarks): + if value is not None: + height = rect.get_height() + if height > max_height: + max_height = height + + ax.text(rect.get_x() + rect.get_width()/2., height + 2, + f'{value:.1f}%', + ha='center', va='bottom') + + benchmark_data = latest_results[benchmark_label].run_values + baseline_value = benchmark_data[baseline_name] + current_value = benchmark_data[run_name] + unit = benchmark_map[benchmark_label].metadata.unit + + tooltip_labels = [ + f"Run: {run_name}\n" + f"Benchmark: {benchmark_label}\n" + f"Value: {current_value:.2f} {unit}\n" + f"Baseline ({baseline_name}): {baseline_value:.2f} {unit}\n" + f"Normalized: {value:.1f}%" + ] + tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}') + mpld3.plugins.connect(ax.figure, tooltip) + + return max_height + +def add_chart_elements(ax: plt.Axes, + group_benchmarks: list[str], + group_name: str, + max_height: float) -> None: + top_padding = max_height * 0.2 + ax.set_ylim(0, max_height + top_padding) + ax.set_ylabel('Performance relative to baseline (%)') + ax.set_title(f'Performance Comparison (Normalized to Baseline) - {group_name} Group') + ax.set_xticks([]) + + for idx, label in enumerate(group_benchmarks): + split_labels = format_benchmark_label(label) + for i, sublabel in enumerate(split_labels): + y_pos = max_height + (top_padding * 0.5) + 2 - (i * top_padding * 0.15) + ax.text(idx, y_pos, sublabel, + ha='center', + style='italic', + color='#666666') + + ax.grid(True, axis='y', alpha=0.2) + ax.legend(bbox_to_anchor=(1, 1), loc='upper left') + +def create_normalized_bar_chart(benchmarks: list[BenchmarkSeries], baseline_name: str) -> list[str]: + latest_results = get_latest_results(benchmarks) + + run_names = sorted(list(set( + name for result in latest_results.values() + for name in result.run_values.keys() + ))) + + if baseline_name not in run_names: + return [] + + benchmark_labels = [b.label for b in benchmarks] + + benchmark_groups = defaultdict(list) + for label in benchmark_labels: + group_name = label.split()[0] + benchmark_groups[group_name].append(label) + + html_charts = [] + + for group_name, group_benchmarks in benchmark_groups.items(): + plt.close('all') + non_baseline_runs = [n for n in run_names if n != baseline_name] + + if len(non_baseline_runs) == 0: + continue + + normalized_data = prepare_normalized_data( + latest_results, benchmarks, group_benchmarks, + non_baseline_runs, baseline_name + ) + + fig, ax = plt.subplots(figsize=(10, 6)) + max_height = create_bar_plot( + ax, normalized_data, group_benchmarks, non_baseline_runs, + latest_results, benchmarks, baseline_name + ) + add_chart_elements(ax, group_benchmarks, group_name, max_height) + + plt.tight_layout() + html_charts.append(mpld3.fig_to_html(fig)) + plt.close(fig) + + return html_charts + +def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> str: + plt.close('all') + + num_benchmarks = len(benchmarks) + if num_benchmarks == 0: + return + + fig, axes = plt.subplots(num_benchmarks, 1, figsize=(10, max(4 * num_benchmarks, 30))) + + if num_benchmarks == 1: + axes = [axes] + + for idx, benchmark in enumerate(benchmarks): + ax = axes[idx] + + for run in benchmark.runs: + sorted_points = sorted(run.results, key=lambda x: x.date) + dates = [point.date for point in 
sorted_points] + values = [point.value for point in sorted_points] + + ax.plot_date(dates, values, '-', label=run.name, alpha=0.5) + scatter = ax.scatter(dates, values, picker=True) + + tooltip_labels = [ + f"Date: {point.date.strftime('%Y-%m-%d %H:%M:%S')}\n" + f"Value: {point.value:.2f}\n" + f"Git Hash: {point.git_hash}" + for point in sorted_points + ] + + targets = [f"https://github.com/{github_repo}/commit/{point.git_hash}" + for point in sorted_points] + + tooltip = mpld3.plugins.PointHTMLTooltip(scatter, tooltip_labels, + css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}', + targets=targets) + mpld3.plugins.connect(fig, tooltip) + + ax.set_title(benchmark.label, pad=20) + performance_indicator = "lower is better" if benchmark.metadata.lower_is_better else "higher is better" + ax.text(0.5, 1.05, f"({performance_indicator})", + ha='center', + transform=ax.transAxes, + style='italic', + fontsize=7, + color='#666666') + + ax.set_xlabel('') + unit = benchmark.metadata.unit + ax.set_ylabel(f"Value ({unit})" if unit else "Value") + ax.grid(True, alpha=0.2) + ax.legend(bbox_to_anchor=(1, 1), loc='upper left') + ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) + ax.xaxis.set_major_locator(mdates.AutoDateLocator()) + + plt.tight_layout() + html = mpld3.fig_to_html(fig) + + plt.close(fig) + return html + +def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[BenchmarkSeries]: + benchmark_metadata: dict[str, BenchmarkMetadata] = {} + run_map: dict[str, dict[str, list[Result]]] = defaultdict(lambda: defaultdict(list)) + + for run in benchmark_runs: + if run.name not in compare_names: + continue + + for result in run.results: + if result.label not in benchmark_metadata: + benchmark_metadata[result.label] = BenchmarkMetadata( + unit=result.unit, + lower_is_better=result.lower_is_better + ) + + result.date = run.date + result.git_hash = run.git_hash + run_map[result.label][run.name].append(result) + + benchmark_series = [] + for label, metadata in benchmark_metadata.items(): + runs = [ + BenchmarkRun(name=run_name, results=results) + for run_name, results in run_map[label].items() + ] + benchmark_series.append(BenchmarkSeries( + label=label, + metadata=metadata, + runs=runs + )) + + return benchmark_series + +def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_names: list[str]) -> str: + baseline_name = compare_names[0] + benchmarks = process_benchmark_data(benchmark_runs, compare_names) + + comparison_html_charts = create_normalized_bar_chart(benchmarks, baseline_name) + timeseries_html = create_time_series_chart(benchmarks, github_repo) + comparison_charts_html = '\n'.join(f'
<div>{chart}</div>' for chart in comparison_html_charts)
+
+    html_template = f"""
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>Benchmark Results</title>
+    </head>
+    <body>
+        <h1>Benchmark Results</h1>
+        <h2>Latest Results Comparison</h2>
+        <div>
+            {comparison_charts_html}
+        </div>
+        <h2>Historical Results</h2>
+        <div>
+            {timeseries_html}
+        </div>
+    </body>
+    </html>
+ + + """ + + return html_template diff --git a/scripts/benchmarks/output.py b/scripts/benchmarks/output_markdown.py similarity index 96% rename from scripts/benchmarks/output.py rename to scripts/benchmarks/output_markdown.py index eec8957fe7..177869f8f0 100644 --- a/scripts/benchmarks/output.py +++ b/scripts/benchmarks/output_markdown.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import collections, re -from benches.base import Result +from benches.result import Result from benches.options import options import math @@ -126,7 +126,7 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): if oln.diff != None: oln.row += f" | {(oln.diff - 1)*100:.2f}%" delta = oln.diff - 1 - oln.bars = round(10*(oln.diff - 1)/max_diff) + oln.bars = round(10*(oln.diff - 1)/max_diff) if max_diff != 0.0 else 0 if oln.bars == 0 or abs(delta) < options.epsilon: oln.row += " | . |" elif oln.bars > 0: @@ -155,7 +155,6 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): if options.verbose: print(oln.row) summary_table += oln.row + "\n" - grouped_objects = collections.defaultdict(list) for oln in output_detailed_list: @@ -211,7 +210,7 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): return summary_line, summary_table -def generate_markdown(chart_data: dict[str, list[Result]]): +def generate_markdown(name: str, chart_data: dict[str, list[Result]]): (summary_line, summary_table) = generate_summary_table_and_chart(chart_data) return f""" @@ -220,5 +219,5 @@ def generate_markdown(chart_data: dict[str, list[Result]]): (result is better)\n {summary_table} # Details -{generate_markdown_details(chart_data["This PR"])} +{generate_markdown_details(chart_data[name])} """ diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index 586837fc6f..0cd10b9513 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -4,12 +4,10 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os -import json import shutil import subprocess # nosec B404 -from pathlib import Path -from benches.result import Result from benches.options import options +from pathlib import Path def run(command, env_vars={}, cwd=None, add_sycl=False): try: @@ -51,25 +49,6 @@ def git_clone(dir, name, repo, commit): raise Exception(f"The directory {repo_path} exists but is not a git repository.") return repo_path -def save_benchmark_results(dir, save_name, benchmark_data: list[Result]): - serialized = [res.to_json() for res in benchmark_data] - results_dir = Path(os.path.join(dir, 'results')) - os.makedirs(results_dir, exist_ok=True) - - file_path = Path(os.path.join(results_dir, f"{save_name}.json")) - with file_path.open('w') as file: - json.dump(serialized, file, indent=4) - print(f"Benchmark results saved to {file_path}") - -def load_benchmark_results(dir, compare_name) -> list[Result]: - file_path = Path(os.path.join(dir, 'results', f"{compare_name}.json")) - if file_path.exists(): - with file_path.open('r') as file: - data = json.load(file) - return [Result.from_json(item) for item in data] - else: - return None - def prepare_bench_cwd(dir): # we need 2 deep to workaround a problem with a fixed relative path in cudaSift options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') diff --git a/third_party/benchmark_requirements.txt b/third_party/benchmark_requirements.txt new file mode 100644 index 0000000000..c01a2215c5 --- /dev/null +++ b/third_party/benchmark_requirements.txt @@ -0,0 
+1,43 @@
+six==1.16.0
+matplotlib==3.9.2
+mpld3==0.5.10
+alabaster==0.7.12
+Babel==2.14.0
+bandit==1.6.2
+beautifulsoup4==4.11.1
+breathe==4.33.1
+bs4==0.0.1
+certifi==2024.07.04
+chardet==3.0.4
+clang-format==15.0.7
+colorama==0.4.1
+docutils==0.15.2
+exhale==0.3.0
+idna==3.7
+imagesize==1.1.0
+Jinja2==3.1.4
+lxml==4.9.3
+Mako==1.3.0
+MarkupSafe==2.1.5
+packaging==24.1
+Pygments==2.17.2
+pyparsing==2.4.5
+pytest>=7.0
+pytz==2019.3
+PyYAML==6.0.1
+requests==2.32.2
+rst2pdf==0.102
+snowballstemmer==2.0.0
+soupsieve==1.9.5
+Sphinx==4.5.0
+sphinx-book-theme==0.3.3
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+sphinxcontrib-websupport==1.2.4
+sphinx-rtd-theme==1.0.0
+urllib3==2.2.2
+dataclasses-json==0.6.7
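
Note: the snippet below is not part of the patch. It is a minimal sketch of how the new comparison options (`--compare-type`, `--compare-max`) are wired through `BenchmarkHistory.get_compare()` in `history.py`, matching the flow in `main.py` and the baseline behavior described in `scripts/benchmarks/README.md`. The working directory path and the `baseline` run name are illustrative assumptions.

```python
# Minimal sketch, assuming the scripts/benchmarks modules from this patch
# (benches/, history.py) are on the import path; the directory and the
# "baseline" name are illustrative only.
from benches.options import Compare, options
from history import BenchmarkHistory

options.compare = Compare.AVERAGE   # or Compare.LATEST / Compare.MEDIAN
options.compare_max = 10            # aggregate over at most 10 stored runs

history = BenchmarkHistory("./bench_workdir")
history.load(1000)                  # read up to 1000 stored result files

baseline = history.get_compare("baseline")
if baseline is not None:
    for r in baseline.results:
        print(r.name, r.value, r.unit)
```

Passing `--compare-type average` to `main.py` does the equivalent internally: it loads the stored runs, averages up to `--compare-max` runs with a matching name per result, and uses that aggregate as the comparison target instead of the single latest run.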