diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..ba06bb8 --- /dev/null +++ b/.clang-format @@ -0,0 +1,46 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 +IndentCaseBlocks: true +IndentWidth: 4 +MaxEmptyLinesToKeep: 2 +SpaceAfterTemplateKeyword: false +SpacesBeforeTrailingComments: 4 +DerivePointerAlignment: false +PointerAlignment: Left +ReferenceAlignment: Left +Standard: Latest + +# These must go in order from most to least specific. +# Priority number goes from lowest to highest. +# Higher the number - lower the include position. +IncludeCategories: + # "*" + - Regex: '^".*"' + Priority: 1 + CaseSensitive: false + # and <*.h> together + - Regex: '^' + Priority: 2 + CaseSensitive: false + # <*/*.h> after <*> + - Regex: '^<.*/.*\.h>' + Priority: 4 + CaseSensitive: false + # and <*.h> together + - Regex: '^<.*\.h>' + Priority: 2 + CaseSensitive: false + # <*> + - Regex: '^<.*' + Priority: 3 + CaseSensitive: false + # * + - Regex: '.*' + Priority: 5 + CaseSensitive: false + +IndentAccessModifiers: true +... + diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..37a9746 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,25 @@ +# THIS FILE IS GENERATED FROM .clangd! +# Run ./.update-clang-tidy.sh to regenerate. +Checks: + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + -abseil*, + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -clang-analyzer-security.insecureAPI.strcpy, + -cppcoreguidelines*, + -cppcoreguidelines-pro*, + -misc-non-copyable-objects, + -misc-use-anonymous-namespace, + -modernize-avoid-c-arrays, + -modernize-redundant-void-arg, + -modernize-use-auto, + -modernize-use-nodiscard, + -modernize-use-noexcept, + -modernize-use-trailing-return-type, + -modernize-use-using, + -performance*, + -readability*, diff --git a/.clangd b/.clangd new file mode 100644 index 0000000..81aca64 --- /dev/null +++ b/.clangd @@ -0,0 +1,36 @@ +CompileFlags: + Remove: -W* + Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa] + Compiler: clang++ + +# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html +Diagnostics: + UnusedIncludes: Strict + ClangTidy: + Add: [ + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + ] + Remove: [ + abseil*, + bugprone-easily-swappable-parameters, + bugprone-reserved-identifier, + clang-analyzer-security.insecureAPI.strcpy, + cppcoreguidelines*, + cppcoreguidelines-pro*, + misc-non-copyable-objects, + misc-use-anonymous-namespace, + modernize-avoid-c-arrays, + modernize-redundant-void-arg, + modernize-use-auto, + modernize-use-nodiscard, + modernize-use-noexcept, + modernize-use-trailing-return-type, + modernize-use-using, + performance*, + readability*, + ] + #CheckOptions: diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ef8c2d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. + +# VisualStudioCode +.vscode/ + +# Below files generated via CMake + +# Build directory +build/ + +# CMake cache +.cache/ + +# Simulated SYSFS - for early development or debug +device/ + +# Misc +__pycache__ +README +README.html + +# do NOT ignore these files +!.clang-format +!.clang-tidy +!.clangd + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3facf44 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +# - How to use: +# python3 -m pip install pre-commit +# pre-commit install --install hooks +# Upon a new commit - the hooks should automagically run +# +# - How to skip: +# git commit --no-verify +# or +# SKIP=clang-format-docker git commit +# SKIP=cpplint-docker git commit + +fail_fast: false +repos: + # For portability I decided to use Docker containers + - repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint + rev: 0.0.3 + hooks: + - id: clang-format-docker + # - id: cpplint-docker + # Below is a local way of running formatters and linters + # NOTE: clang-tidy is not used in the above tests + # - repo: https://github.com/pocc/pre-commit-hooks + # rev: v1.3.5 + # hooks: + # - id: clang-format + # args: [--no-diff, -i] + # - id: clang-tidy + # args: [-p=build, --quiet] + # - id: cpplint + # args: [--verbose=5] diff --git a/.update-clang-tidy.sh b/.update-clang-tidy.sh new file mode 100755 index 0000000..9607b35 --- /dev/null +++ b/.update-clang-tidy.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -x # trace +set -e # exit immediately if command fails +set -u # exit if an undefined variable is found + +awk ' +BEGIN { + print "# THIS FILE IS GENERATED FROM .clangd!" + print "# Run ./.update-clang-tidy.sh to regenerate." + print "Checks:" +} +/Add: \[$/{ +a=1 + next +} +/]/{ + a=0 +} +a{ +gsub(/^\s+/," ") + print +} + +/Remove: \[$/{ +r=1 + next +} +/]/{ + r=0 +} +r{ + gsub(/^\s+/," -") + print +} +' .clangd | tee .clang-tidy diff --git a/base_test.cpp b/base_test.cpp old mode 100755 new mode 100644 index 5320a0e..2dc1eea --- a/base_test.cpp +++ b/base_test.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -44,10 +44,8 @@ // Default Constructor BaseTest::BaseTest(size_t num) { - - // Set the numIteration_ to be 10 by default - num_iteration_ = num; + // Set the numIteration_ to be 10 by default + num_iteration_ = num; } BaseTest::~BaseTest() {} - diff --git a/base_test.hpp b/base_test.hpp old mode 100755 new mode 100644 index 3e79de1..1fde0fa --- a/base_test.hpp +++ b/base_test.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -42,7 +42,7 @@ #ifndef ROC_BANDWIDTH_TEST_BASE_H_ #define ROC_BANDWIDTH_TEST_BASE_H_ -#if(defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) +#if (defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) // Hsa package with out file reorganization // This is for backward compatibility and will be deprecated from future release #include "hsa.h" @@ -59,41 +59,38 @@ using namespace std; // @Brief: An interface for tests to do some basic things, class BaseTest { + public: + BaseTest(size_t num_iter = 4); - public: - - BaseTest(size_t num_iter = 4); - - virtual ~BaseTest(); - - // @Brief: Allows setup proceedures to be completed - // before running the benchmark test case - virtual void SetUp() = 0; + virtual ~BaseTest(); - // @Brief: Launches the proceedures of test scenario - virtual void Run() = 0; + // @Brief: Allows setup proceedures to be completed + // before running the benchmark test case + virtual void SetUp() = 0; - // @Brief: Allows clean up proceedures to be invoked - virtual void Close() = 0; + // @Brief: Launches the proceedures of test scenario + virtual void Run() = 0; - // @Brief: Display the results - virtual void Display() const = 0; + // @Brief: Allows clean up proceedures to be invoked + virtual void Close() = 0; - // @Brief: Set number of iterations to run - void set_num_iteration(size_t num_iter) { - num_iteration_ = num_iter; - return; - } + // @Brief: Display the results + virtual void Display() const = 0; - // @Brief: Pre-declare some variables for deriviation, the - // derived class may declare more if needed - protected: + // @Brief: Set number of iterations to run + void set_num_iteration(size_t num_iter) { + num_iteration_ = num_iter; + return; + } - // @Brief: Real iteration number - uint64_t num_iteration_; + // @Brief: Pre-declare some variables for deriviation, the + // derived class may declare more if needed + protected: + // @Brief: Real iteration number + uint64_t num_iteration_; - // @Brief: Status code - hsa_status_t err_; + // @Brief: Status code + hsa_status_t err_; }; -#endif // ROC_BANDWIDTH_TEST_BASE_H_ +#endif // ROC_BANDWIDTH_TEST_BASE_H_ diff --git a/common.cpp b/common.cpp old mode 100755 new mode 100644 index 1901c05..4dce0c4 --- a/common.cpp +++ b/common.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,131 +43,129 @@ #include "common.hpp" void error_check(hsa_status_t hsa_error_code, int line_num, const char* str) { - if (hsa_error_code != HSA_STATUS_SUCCESS && - hsa_error_code != HSA_STATUS_INFO_BREAK) { - printf("HSA Error Found! In file: %s; At line: %d\n", str, line_num); - const char* string = NULL; - hsa_status_string(hsa_error_code, &string); - printf("Error: %s\n", string); - exit(EXIT_FAILURE); - } + if (hsa_error_code != HSA_STATUS_SUCCESS && hsa_error_code != HSA_STATUS_INFO_BREAK) { + printf("HSA Error Found! In file: %s; At line: %d\n", str, line_num); + const char* string = NULL; + hsa_status_string(hsa_error_code, &string); + printf("Error: %s\n", string); + exit(EXIT_FAILURE); + } } // So far, always find the first device hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_device_type_t hsa_device_type; - hsa_status_t hsa_error_code = - hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); - if (hsa_error_code != HSA_STATUS_SUCCESS) { - return hsa_error_code; - } - - if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { - *((hsa_agent_t*)data) = agent; - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; } hsa_status_t FindCpuDevice(hsa_agent_t agent, void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_device_type_t hsa_device_type; - hsa_status_t hsa_error_code = - hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); - if (hsa_error_code != HSA_STATUS_SUCCESS) { - return hsa_error_code; - } - - if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { - *((hsa_agent_t*)data) = agent; - return HSA_STATUS_INFO_BREAK; - } - - return HSA_STATUS_SUCCESS; + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_CPU) { + *((hsa_agent_t*)data) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; } hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t region, void* data) { - if (NULL == data) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } + if (NULL == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - hsa_status_t err; - hsa_amd_segment_t segment; - uint32_t flag; + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - ErrorCheck(err); + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(err); - err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - ErrorCheck(err); + err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(err); - if ((HSA_AMD_SEGMENT_GLOBAL == segment) && - (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { - *((hsa_amd_memory_pool_t*)data) = region; - } + if ((HSA_AMD_SEGMENT_GLOBAL == segment) && + (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { + *((hsa_amd_memory_pool_t*)data) = region; + } - return HSA_STATUS_SUCCESS; + return HSA_STATUS_SUCCESS; } double CalcMedian(vector scores) { - double median; - size_t size = scores.size(); + double median; + size_t size = scores.size(); - if (size % 2 == 0) - median = (scores[size / 2 - 1] + scores[size / 2]) / 2; - else - median = scores[size / 2]; + if (size % 2 == 0) + median = (scores[size / 2 - 1] + scores[size / 2]) / 2; + else + median = scores[size / 2]; - return median; + return median; } double CalcMean(vector scores) { - double mean = 0; - size_t size = scores.size(); + double mean = 0; + size_t size = scores.size(); - for (size_t i = 0; i < size; ++i) mean += scores[i]; + for (size_t i = 0; i < size; ++i) mean += scores[i]; - return mean / size; + return mean / size; } double CalcStdDeviation(vector scores, int score_mean) { - double ret = 0.0; - for (size_t i = 0; i < scores.size(); ++i) { - ret += (scores[i] - score_mean) * (scores[i] - score_mean); - } + double ret = 0.0; + for (size_t i = 0; i < scores.size(); ++i) { + ret += (scores[i] - score_mean) * (scores[i] - score_mean); + } - ret /= scores.size(); + ret /= scores.size(); - return sqrt(ret); + return sqrt(ret); } int CalcConcurrentQueues(vector scores) { - int num_of_concurrent_queues = 0; - vector execpted_exec_time_array; + int num_of_concurrent_queues = 0; + vector execpted_exec_time_array; - for (size_t i = 0; i < scores.size(); ++i) { - execpted_exec_time_array.push_back(scores[0] / (1 << i)); - } + for (size_t i = 0; i < scores.size(); ++i) { + execpted_exec_time_array.push_back(scores[0] / (1 << i)); + } - for (size_t i = 0; i < scores.size(); ++i) { - cout << "expected exe time = " << execpted_exec_time_array[i] << endl; - } + for (size_t i = 0; i < scores.size(); ++i) { + cout << "expected exe time = " << execpted_exec_time_array[i] << endl; + } - for (size_t i = 1; i < scores.size(); ++i) { - if ((execpted_exec_time_array[i] - scores[i]) < - 0.1 * execpted_exec_time_array[i]) - ++num_of_concurrent_queues; - } + for (size_t i = 1; i < scores.size(); ++i) { + if ((execpted_exec_time_array[i] - scores[i]) < 0.1 * execpted_exec_time_array[i]) + ++num_of_concurrent_queues; + } - return num_of_concurrent_queues; + return num_of_concurrent_queues; } /** hsa_status_t FindHostRegion(hsa_region_t region, void *data) { diff --git a/common.hpp b/common.hpp old mode 100755 new mode 100644 index d2933a0..eaa75d1 --- a/common.hpp +++ b/common.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,12 +43,13 @@ #ifndef ROC_BANDWIDTH_TEST_COMMON_HPP #define ROC_BANDWIDTH_TEST_COMMON_HPP +#include + +#include #include #include #include -#include -#include -#if(defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) +#if (defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) // Hsa package with out file reorganization // This is for backward compatibility and will be deprecated from future release #include "hsa.h" @@ -66,8 +67,8 @@ using namespace std; #else #if defined(__GNUC__) #define ALIGNED_(x) __attribute__((aligned(x))) -#endif // __GNUC__ -#endif // _MSC_VER +#endif // __GNUC__ +#endif // _MSC_VER #define MULTILINE(...) #__VA_ARGS__ @@ -96,4 +97,4 @@ double CalcMedian(vector scores); // @Brief: Calculate the standard deviation of the vector double CalcStdDeviation(vector scores, int score_mean); -#endif // ROC_BANDWIDTH_TEST_COMMON_HPP +#endif // ROC_BANDWIDTH_TEST_COMMON_HPP diff --git a/main.cpp b/main.cpp old mode 100755 new mode 100644 index f3b4ac6..5e8f30e --- a/main.cpp +++ b/main.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -40,64 +40,65 @@ // //////////////////////////////////////////////////////////////////////////////// +#include "rocm_bandwidth_test.hpp" + #include + #include -#include "rocm_bandwidth_test.hpp" using namespace std; int main(int argc, char** argv) { + // Default behavior is implemented as two runs + uint32_t arg_cnt = argc; + if (argc == 1) { + argc++; + argv[1] = (char*)"-a"; + setenv("ROCM_BW_DEFAULT_RUN", "true", true); + } - // Default behavior is implemented as two runs - uint32_t arg_cnt = argc; - if (argc == 1) { - argc++; - argv[1] = (char*)"-a"; - setenv("ROCM_BW_DEFAULT_RUN", "true", true); - } - - // Create the Bandwidth test object - RocmBandwidthTest bw_test1(argc, argv); + // Create the Bandwidth test object + RocmBandwidthTest bw_test1(argc, argv); - // Initialize the Bandwidth test object - bw_test1.SetUp(); + // Initialize the Bandwidth test object + bw_test1.SetUp(); - // Run the Bandwidth tests requested by user - bw_test1.Run(); + // Run the Bandwidth tests requested by user + bw_test1.Run(); - // Return if user has not passed in any arguments - // Display the time taken by various tests - // Release the Bandwidth test object resources - if (arg_cnt != 1) { - bw_test1.Display(); - bw_test1.Close(); - return bw_test1.GetExitValue(); - } + // Return if user has not passed in any arguments + // Display the time taken by various tests + // Release the Bandwidth test object resources + if (arg_cnt != 1) { + bw_test1.Display(); + bw_test1.Close(); + return bw_test1.GetExitValue(); + } - // Run the second iteration of copy requests - if (arg_cnt == 1) { - optind = 1; - argv[1] = (char*)"-A"; - } + // Run the second iteration of copy requests + if (arg_cnt == 1) { + optind = 1; + argv[1] = (char*)"-A"; + } - // Create the Bandwidth test object - RocmBandwidthTest bw_test2(argc, argv); + // Create the Bandwidth test object + RocmBandwidthTest bw_test2(argc, argv); - // Initialize the Bandwidth test object - bw_test2.SetUp(); + // Initialize the Bandwidth test object + bw_test2.SetUp(); - // Run the Bandwidth tests requested by user - bw_test2.Run(); + // Run the Bandwidth tests requested by user + bw_test2.Run(); - // Display the time taken by various tests - // and then release associated resources - bw_test1.Display(); - bw_test1.Close(); + // Display the time taken by various tests + // and then release associated resources + bw_test1.Display(); + bw_test1.Close(); - // Display the time taken by various tests - // and then release associated resources - bw_test2.Display(); - bw_test2.Close(); + // Display the time taken by various tests + // and then release associated resources + bw_test2.Display(); + bw_test2.Close(); - return 0; + return 0; } diff --git a/os.cpp b/os.cpp old mode 100755 new mode 100644 index 0731630..8e63782 --- a/os.cpp +++ b/os.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -41,21 +41,21 @@ //////////////////////////////////////////////////////////////////////////////// // Code for Linux platform -#ifdef __linux__ +#ifdef __linux__ #include "os.hpp" + #include void SetEnv(const char* env_var_name, const char* env_var_value) { - int err = setenv(env_var_name, env_var_value, 1); - if (0 != err) { - printf("Set environment variable failed!\n"); - exit(1); - } - return; + int err = setenv(env_var_name, env_var_value, 1); + if (0 != err) { + printf("Set environment variable failed!\n"); + exit(1); + } + return; } char* GetEnv(const char* env_var_name) { return getenv(env_var_name); } #endif // End of Linux Code - diff --git a/os.hpp b/os.hpp old mode 100755 new mode 100644 index c9a8b2a..ca0950f --- a/os.hpp +++ b/os.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/rocm_bandwidth_test.cpp b/rocm_bandwidth_test.cpp old mode 100755 new mode 100644 index 68895a3..11fec6a --- a/rocm_bandwidth_test.cpp +++ b/rocm_bandwidth_test.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -40,717 +40,655 @@ // //////////////////////////////////////////////////////////////////////////////// -#include "common.hpp" #include "rocm_bandwidth_test.hpp" -#include +#include "common.hpp" + #include -#include +#include #include + +#include #include +#include #include #include -#include #include -#include +#include #include // Initialize the variable used to capture validation failure const double RocmBandwidthTest::VALIDATE_COPY_OP_FAILURE = std::numeric_limits::max(); // The values are in megabytes at allocation time -const size_t RocmBandwidthTest::SIZE_LIST[] = { 1 * 1024, - 2 * 1024, 4 * 1024, 8 * 1024, - 16 * 1024, 32 * 1024, 64 * 1024, - 128 * 1024, 256 * 1024, 512 * 1024, - 1 * 1024 * 1024, 2 * 1024 * 1024, - 4 * 1024 * 1024, 8 * 1024 * 1024, - 16 * 1024 * 1024, 32 * 1024 * 1024, - 64 * 1024 * 1024, 128 * 1024 * 1024, - 256 * 1024 * 1024, 512 * 1024 * 1024}; - -const size_t RocmBandwidthTest::LATENCY_SIZE_LIST[] = { 1, - 2, 4, 8, - 16, 32, 64, - 128, 256, 512, - 1 * 1024, 2 * 1024, - 4 * 1024, 8 * 1024, - 16 * 1024, 32 * 1024, - 64 * 1024, 128 * 1024, - 256 * 1024, 512 * 1024 }; - -uint32_t RocmBandwidthTest::GetIterationNum() { - return (validate_) ? 1 : (num_iteration_ + 1); -} +const size_t RocmBandwidthTest::SIZE_LIST[] = { + 1 * 1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, + 32 * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024, + 1 * 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024, 16 * 1024 * 1024, + 32 * 1024 * 1024, 64 * 1024 * 1024, 128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024}; + +const size_t RocmBandwidthTest::LATENCY_SIZE_LIST[] = { + 1, 2, 4, 8, 16, 32, 64, + 128, 256, 512, 1 * 1024, 2 * 1024, 4 * 1024, 8 * 1024, + 16 * 1024, 32 * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024}; + +uint32_t RocmBandwidthTest::GetIterationNum() { return (validate_) ? 1 : (num_iteration_ + 1); } void RocmBandwidthTest::AcquireAccess(hsa_agent_t agent, void* ptr) { - err_ = hsa_amd_agents_allow_access(1, &agent, NULL, ptr); - ErrorCheck(err_); + err_ = hsa_amd_agents_allow_access(1, &agent, NULL, ptr); + ErrorCheck(err_); } -void RocmBandwidthTest::AcquirePoolAcceses(uint32_t src_dev_idx, - hsa_agent_t src_agent, void* src, - uint32_t dst_dev_idx, - hsa_agent_t dst_agent, void* dst) { - - // determine which one is a cpu and call acquire on the other agent - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - if (src_dev_type == HSA_DEVICE_TYPE_GPU) { - AcquireAccess(src_agent, dst); - } +void RocmBandwidthTest::AcquirePoolAcceses(uint32_t src_dev_idx, hsa_agent_t src_agent, void* src, + uint32_t dst_dev_idx, hsa_agent_t dst_agent, void* dst) { + // determine which one is a cpu and call acquire on the other agent + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + if (src_dev_type == HSA_DEVICE_TYPE_GPU) { + AcquireAccess(src_agent, dst); + } - if (dst_dev_type == HSA_DEVICE_TYPE_GPU) { - AcquireAccess(dst_agent, src); - } + if (dst_dev_type == HSA_DEVICE_TYPE_GPU) { + AcquireAccess(dst_agent, src); + } - return; + return; } - -void RocmBandwidthTest::InitializeSrcBuffer(size_t size, void* buf_cpy, - uint32_t cpy_dev_idx, hsa_agent_t cpy_agent) { - // Allocate host buffers and setup accessibility for copy operation - if (init_src_ == NULL) { - err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&init_src_); - ErrorCheck(err_); - long double* src_buf = (long double*) init_src_; - uint32_t count = (size / sizeof(long double)); - for (uint32_t idx = 0; idx < count; idx++) { - src_buf[idx] = (init_) ? init_val_ : sin(idx); +void RocmBandwidthTest::InitializeSrcBuffer(size_t size, void* buf_cpy, uint32_t cpy_dev_idx, + hsa_agent_t cpy_agent) { + // Allocate host buffers and setup accessibility for copy operation + if (init_src_ == NULL) { + err_ = hsa_amd_memory_pool_allocate(sys_pool_, size, 0, (void**)&init_src_); + ErrorCheck(err_); + long double* src_buf = (long double*)init_src_; + uint32_t count = (size / sizeof(long double)); + for (uint32_t idx = 0; idx < count; idx++) { + src_buf[idx] = (init_) ? init_val_ : sin(idx); + } + err_ = hsa_signal_create(0, 0, NULL, &init_signal_); + ErrorCheck(err_); } - err_ = hsa_signal_create(0, 0, NULL, &init_signal_); - ErrorCheck(err_); - } - // If copying agent is a CPU, use memcpy to initialize copy buffer - hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; - if (cpy_dev_type == HSA_DEVICE_TYPE_CPU) { - std::memcpy(buf_cpy, init_src_, size); + // If copying agent is a CPU, use memcpy to initialize copy buffer + hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; + if (cpy_dev_type == HSA_DEVICE_TYPE_CPU) { + std::memcpy(buf_cpy, init_src_, size); + return; + } + + // Copying device is a Gpu, setup buffer access + // before copying initialization buffer + AcquireAccess(cpy_agent, init_src_); + hsa_signal_store_relaxed(init_signal_, 1); + copy_buffer(buf_cpy, cpy_agent, init_src_, cpu_agent_, size, init_signal_); return; - } - - // Copying device is a Gpu, setup buffer access - // before copying initialization buffer - AcquireAccess(cpy_agent, init_src_); - hsa_signal_store_relaxed(init_signal_, 1); - copy_buffer(buf_cpy, cpy_agent, - init_src_, cpu_agent_, - size, init_signal_); - return; } - + bool RocmBandwidthTest::ValidateDstBuffer(size_t max_size, size_t curr_size, void* buf_cpy, uint32_t cpy_dev_idx, hsa_agent_t cpy_agent) { + // Allocate host buffers and setup accessibility for copy operation + if (validate_dst_ == NULL) { + err_ = hsa_amd_memory_pool_allocate(sys_pool_, max_size, 0, (void**)&validate_dst_); + ErrorCheck(err_); + } - // Allocate host buffers and setup accessibility for copy operation - if (validate_dst_ == NULL) { - err_ = hsa_amd_memory_pool_allocate(sys_pool_, max_size, 0, (void**)&validate_dst_); - ErrorCheck(err_); - } + // If Copy device is a Gpu setup buffer access + std::memset(validate_dst_, ~(0x23), curr_size); + hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; + if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) { + AcquireAccess(cpy_agent, validate_dst_); + hsa_signal_store_relaxed(init_signal_, 1); + copy_buffer(validate_dst_, cpu_agent_, buf_cpy, cpy_agent, curr_size, init_signal_); + } else { + // Copying device is a CPU, copy dst buffer + // into validation buffer + std::memcpy(validate_dst_, buf_cpy, curr_size); + } - // If Copy device is a Gpu setup buffer access - std::memset(validate_dst_, ~(0x23), curr_size); - hsa_device_type_t cpy_dev_type = agent_list_[cpy_dev_idx].device_type_; - if (cpy_dev_type == HSA_DEVICE_TYPE_GPU) { - AcquireAccess(cpy_agent, validate_dst_); - hsa_signal_store_relaxed(init_signal_, 1); - copy_buffer(validate_dst_, cpu_agent_, - buf_cpy, cpy_agent, - curr_size, init_signal_); - } else { - - // Copying device is a CPU, copy dst buffer - // into validation buffer - std::memcpy(validate_dst_, buf_cpy, curr_size); - } - - // Compare initialization buffer with validation buffer - err_ = (hsa_status_t)std::memcmp(init_src_, validate_dst_, curr_size); - if (err_ != HSA_STATUS_SUCCESS) { - exit_value_ = err_; - } - return (err_ == HSA_STATUS_SUCCESS); + // Compare initialization buffer with validation buffer + err_ = (hsa_status_t)std::memcmp(init_src_, validate_dst_, curr_size); + if (err_ != HSA_STATUS_SUCCESS) { + exit_value_ = err_; + } + return (err_ == HSA_STATUS_SUCCESS); } -void RocmBandwidthTest::AllocateConcurrentCopyResources(bool bidir, - vector& trans_list, - vector& buf_list, - vector& dev_list, - vector& dev_idx_list, - vector& sig_list, - vector& pool_list) { - - // Number of Unidirectional or Bidirectional - // Concurrent Copy transactions in user request - uint32_t trans_cnt = trans_list.size(); - size_t max_size = size_list_.back(); - - // Common variables used in different loops - void* buf_src; - void* buf_dst; - uint32_t src_idx; - uint32_t dst_idx; - hsa_signal_t signal; - hsa_agent_t src_dev; - hsa_agent_t dst_dev; - uint32_t src_dev_idx; - uint32_t dst_dev_idx; - hsa_amd_memory_pool_t src_pool; - hsa_amd_memory_pool_t dst_pool; - - // Allocate buffers for the various transactions - for (uint32_t idx = 0; idx < trans_cnt; idx++) { - async_trans_t& trans = trans_list[idx]; - src_idx = trans.copy.src_idx_; - dst_idx = trans.copy.dst_idx_; - src_pool = trans.copy.src_pool_; - dst_pool = trans.copy.dst_pool_; - src_dev = pool_list_[src_idx].owner_agent_; - dst_dev = pool_list_[dst_idx].owner_agent_; - src_dev_idx = pool_list_[src_idx].agent_index_; - dst_dev_idx = pool_list_[dst_idx].agent_index_; - - // Allocate buffers and signal for forward copy operation - AllocateCopyBuffers(max_size, - buf_src, src_pool, - buf_dst, dst_pool); - - err_ = hsa_signal_create(1, 0, NULL, &signal); - ErrorCheck(err_); +void RocmBandwidthTest::AllocateConcurrentCopyResources( + bool bidir, vector& trans_list, vector& buf_list, + vector& dev_list, vector& dev_idx_list, vector& sig_list, + vector& pool_list) { + // Number of Unidirectional or Bidirectional + // Concurrent Copy transactions in user request + uint32_t trans_cnt = trans_list.size(); + size_t max_size = size_list_.back(); + + // Common variables used in different loops + void* buf_src; + void* buf_dst; + uint32_t src_idx; + uint32_t dst_idx; + hsa_signal_t signal; + hsa_agent_t src_dev; + hsa_agent_t dst_dev; + uint32_t src_dev_idx; + uint32_t dst_dev_idx; + hsa_amd_memory_pool_t src_pool; + hsa_amd_memory_pool_t dst_pool; + + // Allocate buffers for the various transactions + for (uint32_t idx = 0; idx < trans_cnt; idx++) { + async_trans_t& trans = trans_list[idx]; + src_idx = trans.copy.src_idx_; + dst_idx = trans.copy.dst_idx_; + src_pool = trans.copy.src_pool_; + dst_pool = trans.copy.dst_pool_; + src_dev = pool_list_[src_idx].owner_agent_; + dst_dev = pool_list_[dst_idx].owner_agent_; + src_dev_idx = pool_list_[src_idx].agent_index_; + dst_dev_idx = pool_list_[dst_idx].agent_index_; + + // Allocate buffers and signal for forward copy operation + AllocateCopyBuffers(max_size, buf_src, src_pool, buf_dst, dst_pool); + + err_ = hsa_signal_create(1, 0, NULL, &signal); + ErrorCheck(err_); - // Acquire access to destination buffers - AcquirePoolAcceses(src_dev_idx, src_dev, buf_src, - dst_dev_idx, dst_dev, buf_dst); - - sig_list.push_back(signal); - buf_list.push_back(buf_src); - buf_list.push_back(buf_dst); - dev_list.push_back(src_dev); - dev_list.push_back(dst_dev); - dev_idx_list.push_back(src_dev_idx); - dev_idx_list.push_back(dst_dev_idx); - - // Initialize source buffers with data that could be verified - InitializeSrcBuffer(max_size, buf_src, src_dev_idx, src_dev); - - // For bidirectional copies allocate buffers - // and signal for reverse direction as well - if (bidir) { - AllocateCopyBuffers(max_size, - buf_src, dst_pool, - buf_dst, src_pool); - err_ = hsa_signal_create(1, 0, NULL, &signal); - ErrorCheck(err_); - - // Acquire access to destination buffers - AcquirePoolAcceses(dst_dev_idx, dst_dev, buf_src, - src_dev_idx, src_dev, buf_dst); - - sig_list.push_back(signal); - buf_list.push_back(buf_src); - buf_list.push_back(buf_dst); - dev_list.push_back(dst_dev); - dev_list.push_back(src_dev); - dev_idx_list.push_back(dst_dev_idx); - dev_idx_list.push_back(src_dev_idx); - - // Initialize source buffers with data that could be verified - InitializeSrcBuffer(max_size, buf_src, dst_dev_idx, dst_dev); + // Acquire access to destination buffers + AcquirePoolAcceses(src_dev_idx, src_dev, buf_src, dst_dev_idx, dst_dev, buf_dst); + + sig_list.push_back(signal); + buf_list.push_back(buf_src); + buf_list.push_back(buf_dst); + dev_list.push_back(src_dev); + dev_list.push_back(dst_dev); + dev_idx_list.push_back(src_dev_idx); + dev_idx_list.push_back(dst_dev_idx); + + // Initialize source buffers with data that could be verified + InitializeSrcBuffer(max_size, buf_src, src_dev_idx, src_dev); + + // For bidirectional copies allocate buffers + // and signal for reverse direction as well + if (bidir) { + AllocateCopyBuffers(max_size, buf_src, dst_pool, buf_dst, src_pool); + err_ = hsa_signal_create(1, 0, NULL, &signal); + ErrorCheck(err_); + + // Acquire access to destination buffers + AcquirePoolAcceses(dst_dev_idx, dst_dev, buf_src, src_dev_idx, src_dev, buf_dst); + + sig_list.push_back(signal); + buf_list.push_back(buf_src); + buf_list.push_back(buf_dst); + dev_list.push_back(dst_dev); + dev_list.push_back(src_dev); + dev_idx_list.push_back(dst_dev_idx); + dev_idx_list.push_back(src_dev_idx); + + // Initialize source buffers with data that could be verified + InitializeSrcBuffer(max_size, buf_src, dst_dev_idx, dst_dev); + } } - } } -void RocmBandwidthTest::AllocateCopyBuffers(size_t size, - void*& src, hsa_amd_memory_pool_t src_pool, - void*& dst, hsa_amd_memory_pool_t dst_pool) { - - // Allocate buffers in src and dst pools for forward copy - err_ = hsa_amd_memory_pool_allocate(src_pool, size, 0, &src); - ErrorCheck(err_); - err_ = hsa_amd_memory_pool_allocate(dst_pool, size, 0, &dst); - ErrorCheck(err_); +void RocmBandwidthTest::AllocateCopyBuffers(size_t size, void*& src, hsa_amd_memory_pool_t src_pool, + void*& dst, hsa_amd_memory_pool_t dst_pool) { + // Allocate buffers in src and dst pools for forward copy + err_ = hsa_amd_memory_pool_allocate(src_pool, size, 0, &src); + ErrorCheck(err_); + err_ = hsa_amd_memory_pool_allocate(dst_pool, size, 0, &dst); + ErrorCheck(err_); } void RocmBandwidthTest::ReleaseBuffers(std::vector& buffer_list) { - - for(uint32_t idx = 0; idx < buffer_list.size(); idx++) { - void* buffer = buffer_list[idx]; - err_ = hsa_amd_memory_pool_free(buffer); - ErrorCheck(err_); - } + for (uint32_t idx = 0; idx < buffer_list.size(); idx++) { + void* buffer = buffer_list[idx]; + err_ = hsa_amd_memory_pool_free(buffer); + ErrorCheck(err_); + } } void RocmBandwidthTest::ReleaseSignals(std::vector& signal_list) { + for (uint32_t idx = 0; idx < signal_list.size(); idx++) { + hsa_signal_t signal = signal_list[idx]; + err_ = hsa_signal_destroy(signal); + ErrorCheck(err_); + } +} - for(uint32_t idx = 0; idx < signal_list.size(); idx++) { - hsa_signal_t signal = signal_list[idx]; - err_ = hsa_signal_destroy(signal); +double RocmBandwidthTest::GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, + hsa_signal_t signal_rev) { + // Obtain time taken for forward copy + hsa_amd_profiling_async_copy_time_t async_time_fwd = {0}; + err_ = hsa_amd_profiling_get_async_copy_time(signal_fwd, &async_time_fwd); + ErrorCheck(err_); + if (bidir == false) { + return (async_time_fwd.end - async_time_fwd.start); + } + + hsa_amd_profiling_async_copy_time_t async_time_rev = {0}; + err_ = hsa_amd_profiling_get_async_copy_time(signal_rev, &async_time_rev); ErrorCheck(err_); - } -} -double RocmBandwidthTest::GetGpuCopyTime(bool bidir, - hsa_signal_t signal_fwd, - hsa_signal_t signal_rev) { - - // Obtain time taken for forward copy - hsa_amd_profiling_async_copy_time_t async_time_fwd = {0}; - err_= hsa_amd_profiling_get_async_copy_time(signal_fwd, &async_time_fwd); - ErrorCheck(err_); - if (bidir == false) { - return(async_time_fwd.end - async_time_fwd.start); - } - - hsa_amd_profiling_async_copy_time_t async_time_rev = {0}; - err_= hsa_amd_profiling_get_async_copy_time(signal_rev, &async_time_rev); - ErrorCheck(err_); - - // Compute time taken to copy - double start = min(async_time_fwd.start, async_time_rev.start); - double end = max(async_time_fwd.end, async_time_rev.end); - double copy_time = end - start; - - // Forward copy completed before Reverse began - if (async_time_fwd.end < async_time_rev.start) { - return (copy_time - (async_time_rev.start - async_time_fwd.end)); - } - - // Reverse copy completed before Forward began - if (async_time_rev.end < async_time_fwd.start) { - return (copy_time - (async_time_fwd.start - async_time_rev.end)); - } - - // Forward and Reverse copies overlapped - return copy_time; + // Compute time taken to copy + double start = min(async_time_fwd.start, async_time_rev.start); + double end = max(async_time_fwd.end, async_time_rev.end); + double copy_time = end - start; + + // Forward copy completed before Reverse began + if (async_time_fwd.end < async_time_rev.start) { + return (copy_time - (async_time_rev.start - async_time_fwd.end)); + } + + // Reverse copy completed before Forward began + if (async_time_rev.end < async_time_fwd.start) { + return (copy_time - (async_time_fwd.start - async_time_rev.end)); + } + + // Forward and Reverse copies overlapped + return copy_time; } void RocmBandwidthTest::WaitForCopyCompletion(vector& signal_list) { + hsa_wait_state_t policy = + (bw_blocking_run_ == NULL) ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED; + + uint32_t size = signal_list.size(); + for (uint32_t idx = 0; idx < size; idx++) { + hsa_signal_t signal = signal_list[idx]; + while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), policy)) + ; + } +} - hsa_wait_state_t policy = (bw_blocking_run_ == NULL) ? - HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED; +void RocmBandwidthTest::copy_buffer(void* dst, hsa_agent_t dst_agent, void* src, + hsa_agent_t src_agent, size_t size, hsa_signal_t signal) { + // Copy from src into dst buffer + err_ = hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size, 0, NULL, signal); + ErrorCheck(err_); - uint32_t size = signal_list.size(); - for (uint32_t idx = 0; idx < size; idx++) { - hsa_signal_t signal = signal_list[idx]; - while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, - 1, uint64_t(-1), policy)); - } + // Wait for the forward copy operation to complete + while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), + HSA_WAIT_STATE_ACTIVE)) + ; } -void RocmBandwidthTest::copy_buffer(void* dst, hsa_agent_t dst_agent, - void* src, hsa_agent_t src_agent, - size_t size, hsa_signal_t signal) { +void RocmBandwidthTest::RunConcurrentCopyBenchmark(bool bidir, vector& trans_list) { + // Number of Unidirectional or Bidirectional + // Concurrent Copy transactions in user request + uint32_t trans_cnt = trans_list.size(); + size_t max_size = size_list_.back(); + uint32_t size_len = size_list_.size(); + + // Lists of buffers, pools, agents and signals + // used to run copy requests + vector buf_list; + vector dev_list; + vector dev_idx_list; + vector sig_list; + vector pool_list; + + // Allocate resources for the various transactions + AllocateConcurrentCopyResources(bidir, trans_list, buf_list, dev_list, dev_idx_list, sig_list, + pool_list); + + // Common variables used in different loops + void* buf_src; + void* buf_dst; + hsa_agent_t src_dev; + hsa_agent_t dst_dev; + hsa_signal_t signal; + + // Signa to trigger all copy requests to wait + // until allowed to begin + hsa_signal_t sig_grp_start; + err_ = hsa_signal_create(1, 0, NULL, &sig_grp_start); + ErrorCheck(err_); - // Copy from src into dst buffer - err_ = hsa_amd_memory_async_copy(dst, dst_agent, - src, src_agent, - size, 0, NULL, signal); - ErrorCheck(err_); + // Bind the number of iterations + uint32_t iterations = GetIterationNum(); - // Wait for the forward copy operation to complete - while (hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE)); -} + // Iterate through the differnt buffer sizes to + // compute the bandwidth as determined by copy + for (uint32_t idx = 0; idx < size_len; idx++) { + // This should not be happening + size_t curr_size = size_list_[idx]; + if (curr_size > max_size) { + break; + } -void RocmBandwidthTest::RunConcurrentCopyBenchmark(bool bidir, - vector& trans_list) { - - // Number of Unidirectional or Bidirectional - // Concurrent Copy transactions in user request - uint32_t trans_cnt = trans_list.size(); - size_t max_size = size_list_.back(); - uint32_t size_len = size_list_.size(); - - // Lists of buffers, pools, agents and signals - // used to run copy requests - vector buf_list; - vector dev_list; - vector dev_idx_list; - vector sig_list; - vector pool_list; - - // Allocate resources for the various transactions - AllocateConcurrentCopyResources(bidir, trans_list, - buf_list, dev_list, - dev_idx_list, sig_list, pool_list); - - // Common variables used in different loops - void* buf_src; - void* buf_dst; - hsa_agent_t src_dev; - hsa_agent_t dst_dev; - hsa_signal_t signal; - - // Signa to trigger all copy requests to wait - // until allowed to begin - hsa_signal_t sig_grp_start; - err_ = hsa_signal_create(1, 0, NULL, &sig_grp_start); - ErrorCheck(err_); - - // Bind the number of iterations - uint32_t iterations = GetIterationNum(); - - // Iterate through the differnt buffer sizes to - // compute the bandwidth as determined by copy - for (uint32_t idx = 0; idx < size_len; idx++) { - - // This should not be happening - size_t curr_size = size_list_[idx]; - if (curr_size > max_size) { - break; - } + std::vector> gpu_time_list(trans_cnt, std::vector()); + for (uint32_t it = 0; it < iterations; it++) { + if (it % 2) { + printf("."); + fflush(stdout); + } + + // Set group trigger signal + hsa_signal_store_relaxed(sig_grp_start, 1); + + // Update signal value to one before submitting copy requests + uint32_t sig_idx = 0; + uint32_t sig_cnt = sig_list.size(); + for (sig_idx = 0; sig_idx < sig_cnt; sig_idx++) { + signal = sig_list[sig_idx]; + hsa_signal_store_relaxed(signal, 1); + } + + // Submit copy operations in batch mode + uint32_t rsrc_idx = 0; + uint32_t cpy_cnt = (bidir) ? (trans_cnt * 2) : trans_cnt; + for (uint32_t cpy_idx = 0; cpy_idx < cpy_cnt; cpy_idx++) { + sig_idx = cpy_idx; + rsrc_idx = cpy_idx * 2; + signal = sig_list[sig_idx + 0]; + buf_src = buf_list[rsrc_idx + 0]; + buf_dst = buf_list[rsrc_idx + 1]; + src_dev = dev_list[rsrc_idx + 0]; + dst_dev = dev_list[rsrc_idx + 1]; + + err_ = hsa_amd_memory_async_copy(buf_dst, dst_dev, buf_src, src_dev, curr_size, 1, + &sig_grp_start, signal); + ErrorCheck(err_); + } + + // Set group trigger signal + hsa_signal_store_relaxed(sig_grp_start, 0); + + // Wait for the copy operations to complete + WaitForCopyCompletion(sig_list); + + // Retrieve times for each copy operation + hsa_signal_t signal_rev; + for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) { + sig_idx = (bidir) ? (tidx * 2) : (tidx); + signal = sig_list[sig_idx + 0]; + signal_rev = (bidir) ? (sig_list[sig_idx + 1]) : signal; + double temp = GetGpuCopyTime(bidir, signal, signal_rev); + std::vector& gpu_time = gpu_time_list[tidx]; + gpu_time.push_back(temp); + } + } - std::vector< std::vector > gpu_time_list(trans_cnt, std::vector()); - for (uint32_t it = 0; it < iterations; it++) { - if (it % 2) { - printf("."); - fflush(stdout); - } - - // Set group trigger signal - hsa_signal_store_relaxed(sig_grp_start, 1); - - // Update signal value to one before submitting copy requests - uint32_t sig_idx = 0; - uint32_t sig_cnt = sig_list.size(); - for (sig_idx = 0; sig_idx < sig_cnt; sig_idx++) { - signal = sig_list[sig_idx]; - hsa_signal_store_relaxed(signal, 1); - } - - // Submit copy operations in batch mode - uint32_t rsrc_idx = 0; - uint32_t cpy_cnt = (bidir) ? (trans_cnt * 2) : trans_cnt; - for (uint32_t cpy_idx = 0; cpy_idx < cpy_cnt; cpy_idx++) { - - sig_idx = cpy_idx; - rsrc_idx = cpy_idx * 2; - signal = sig_list[sig_idx + 0]; - buf_src = buf_list[rsrc_idx + 0]; - buf_dst = buf_list[rsrc_idx + 1]; - src_dev = dev_list[rsrc_idx + 0]; - dst_dev = dev_list[rsrc_idx + 1]; - - err_ = hsa_amd_memory_async_copy(buf_dst, dst_dev, - buf_src, src_dev, curr_size, - 1, &sig_grp_start, signal); - ErrorCheck(err_); - } - - // Set group trigger signal - hsa_signal_store_relaxed(sig_grp_start, 0); - - // Wait for the copy operations to complete - WaitForCopyCompletion(sig_list); - - // Retrieve times for each copy operation - hsa_signal_t signal_rev; - for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) { - sig_idx = (bidir) ? (tidx * 2) : (tidx); - signal = sig_list[sig_idx + 0]; - signal_rev = (bidir) ? (sig_list[sig_idx + 1]) : signal; - double temp = GetGpuCopyTime(bidir, signal, signal_rev); - std::vector& gpu_time = gpu_time_list[tidx]; - gpu_time.push_back(temp); - } - } - - // Update time taken to copy a particular size - // Get Gpu min and mean copy times - for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) { - async_trans_t& trans = trans_list[tidx]; - std::vector& gpu_time = gpu_time_list[tidx]; - double min_time = GetMinTime(gpu_time); - double mean_time = GetMeanTime(gpu_time); - trans.gpu_min_time_.push_back(min_time); - trans.gpu_avg_time_.push_back(mean_time); - gpu_time.clear(); + // Update time taken to copy a particular size + // Get Gpu min and mean copy times + for (uint32_t tidx = 0; tidx < trans_cnt; tidx++) { + async_trans_t& trans = trans_list[tidx]; + std::vector& gpu_time = gpu_time_list[tidx]; + double min_time = GetMinTime(gpu_time); + double mean_time = GetMeanTime(gpu_time); + trans.gpu_min_time_.push_back(min_time); + trans.gpu_avg_time_.push_back(mean_time); + gpu_time.clear(); + } } - } - // Free up buffers and signal objects used in copy operation - sig_list.push_back(sig_grp_start); - ReleaseSignals(sig_list); - ReleaseBuffers(buf_list); + // Free up buffers and signal objects used in copy operation + sig_list.push_back(sig_grp_start); + ReleaseSignals(sig_list); + ReleaseBuffers(buf_list); } void RocmBandwidthTest::RunCopyBenchmark(async_trans_t& trans) { - - // Bind if this transaction is bidirectional - bool bidir = trans.copy.bidir_; - - // Initialize size of buffer to equal the largest element of allocation - size_t max_size = size_list_.back(); - uint32_t size_len = size_list_.size(); - - // Bind to resources such as pool and agents that are involved - // in both forward and reverse copy operations - void* buf_src_fwd; - void* buf_dst_fwd; - void* buf_src_rev; - void* buf_dst_rev; - hsa_signal_t signal_fwd; - hsa_signal_t signal_rev; - hsa_signal_t signal_start_bidir; - uint32_t src_idx = trans.copy.src_idx_; - uint32_t dst_idx = trans.copy.dst_idx_; - uint32_t src_dev_idx_fwd = pool_list_[src_idx].agent_index_; - uint32_t dst_dev_idx_fwd = pool_list_[dst_idx].agent_index_; - uint32_t src_dev_idx_rev = dst_dev_idx_fwd; - uint32_t dst_dev_idx_rev = src_dev_idx_fwd; - hsa_amd_memory_pool_t src_pool_fwd = trans.copy.src_pool_; - hsa_amd_memory_pool_t dst_pool_fwd = trans.copy.dst_pool_; - hsa_amd_memory_pool_t src_pool_rev = dst_pool_fwd; - hsa_amd_memory_pool_t dst_pool_rev = src_pool_fwd; - hsa_agent_t src_agent_fwd = pool_list_[src_idx].owner_agent_; - hsa_agent_t dst_agent_fwd = pool_list_[dst_idx].owner_agent_; - hsa_agent_t src_agent_rev = dst_agent_fwd; - hsa_agent_t dst_agent_rev = src_agent_fwd; - std::vector buffer_list; - std::vector signal_list; - - // Allocate buffers for forward path of unidirectional - // or bidirectional copy - AllocateCopyBuffers(max_size, - buf_src_fwd, src_pool_fwd, - buf_dst_fwd, dst_pool_fwd); - - // Create a signal to wait on copy operation - // @TODO: replace it with a signal pool call - err_ = hsa_signal_create(1, 0, NULL, &signal_fwd); - ErrorCheck(err_); - - // Collect resources to be released later - signal_list.push_back(signal_fwd); - buffer_list.push_back(buf_src_fwd); - buffer_list.push_back(buf_dst_fwd); - - // Allocate buffers for reverse path of bidirectional copy - if (bidir) { - AllocateCopyBuffers(max_size, - buf_src_rev, src_pool_rev, - buf_dst_rev, dst_pool_rev); - - // Create a signal to begin bidir copy operations + // Bind if this transaction is bidirectional + bool bidir = trans.copy.bidir_; + + // Initialize size of buffer to equal the largest element of allocation + size_t max_size = size_list_.back(); + uint32_t size_len = size_list_.size(); + + // Bind to resources such as pool and agents that are involved + // in both forward and reverse copy operations + void* buf_src_fwd; + void* buf_dst_fwd; + void* buf_src_rev; + void* buf_dst_rev; + hsa_signal_t signal_fwd; + hsa_signal_t signal_rev; + hsa_signal_t signal_start_bidir; + uint32_t src_idx = trans.copy.src_idx_; + uint32_t dst_idx = trans.copy.dst_idx_; + uint32_t src_dev_idx_fwd = pool_list_[src_idx].agent_index_; + uint32_t dst_dev_idx_fwd = pool_list_[dst_idx].agent_index_; + uint32_t src_dev_idx_rev = dst_dev_idx_fwd; + uint32_t dst_dev_idx_rev = src_dev_idx_fwd; + hsa_amd_memory_pool_t src_pool_fwd = trans.copy.src_pool_; + hsa_amd_memory_pool_t dst_pool_fwd = trans.copy.dst_pool_; + hsa_amd_memory_pool_t src_pool_rev = dst_pool_fwd; + hsa_amd_memory_pool_t dst_pool_rev = src_pool_fwd; + hsa_agent_t src_agent_fwd = pool_list_[src_idx].owner_agent_; + hsa_agent_t dst_agent_fwd = pool_list_[dst_idx].owner_agent_; + hsa_agent_t src_agent_rev = dst_agent_fwd; + hsa_agent_t dst_agent_rev = src_agent_fwd; + std::vector buffer_list; + std::vector signal_list; + + // Allocate buffers for forward path of unidirectional + // or bidirectional copy + AllocateCopyBuffers(max_size, buf_src_fwd, src_pool_fwd, buf_dst_fwd, dst_pool_fwd); + + // Create a signal to wait on copy operation // @TODO: replace it with a signal pool call - err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + err_ = hsa_signal_create(1, 0, NULL, &signal_fwd); ErrorCheck(err_); - err_ = hsa_signal_create(1, 0, NULL, &signal_start_bidir); - ErrorCheck(err_); - - signal_list.push_back(signal_rev); - signal_list.push_back(signal_start_bidir); - buffer_list.push_back(buf_src_rev); - buffer_list.push_back(buf_dst_rev); - } - - // Initialize source buffers with data that could be verified - InitializeSrcBuffer(max_size, buf_src_fwd, - src_dev_idx_fwd, src_agent_fwd); - if (bidir) { - InitializeSrcBuffer(max_size, buf_src_rev, - src_dev_idx_rev, src_agent_rev); - } - - // Setup access to destination buffers for - // both unidirectional and bidirectional copies - AcquirePoolAcceses(src_dev_idx_fwd, src_agent_fwd, buf_src_fwd, - dst_dev_idx_fwd, dst_agent_fwd, buf_dst_fwd); - if (bidir) { - AcquirePoolAcceses(src_dev_idx_rev, src_agent_rev, buf_src_rev, - dst_dev_idx_rev, dst_agent_rev, buf_dst_rev); - } - - // Bind the number of iterations - uint32_t iterations = GetIterationNum(); - - // Iterate through the differnt buffer sizes to - // compute the bandwidth as determined by copy - for (uint32_t idx = 0; idx < size_len; idx++) { - - // This should not be happening - size_t curr_size = size_list_[idx]; - if (curr_size > max_size) { - break; - } - bool verify = true; - std::vector cpu_time; - std::vector gpu_time; - for (uint32_t it = 0; it < iterations; it++) { - if (it % 2) { - printf("."); - fflush(stdout); - } - - hsa_signal_store_relaxed(signal_fwd, 1); - if (bidir) { - hsa_signal_store_relaxed(signal_rev, 1); - hsa_signal_store_relaxed(signal_start_bidir, 1); - } - - // Temporary code for testing - if (sleep_time_ > 0) { - std::this_thread::sleep_for(sleep_usecs_); - } - - // Create a timer object and start it - if (print_cpu_time_) { - cpu_start_ = std::chrono::steady_clock::now(); - } - - // Launch the copy operation - if (bidir == false) { - err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, - buf_src_fwd, src_agent_fwd, - curr_size, 0, NULL, signal_fwd); - } else { - err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, - buf_src_fwd, src_agent_fwd, - curr_size, 1, &signal_start_bidir, - signal_fwd); - } - ErrorCheck(err_); - - // Launch reverse copy operation if it is bidirectional - if (bidir) { - err_ = hsa_amd_memory_async_copy(buf_dst_rev, dst_agent_rev, - buf_src_rev, src_agent_rev, - curr_size, 1, &signal_start_bidir, - signal_rev); + // Collect resources to be released later + signal_list.push_back(signal_fwd); + buffer_list.push_back(buf_src_fwd); + buffer_list.push_back(buf_dst_fwd); + + // Allocate buffers for reverse path of bidirectional copy + if (bidir) { + AllocateCopyBuffers(max_size, buf_src_rev, src_pool_rev, buf_dst_rev, dst_pool_rev); + + // Create a signal to begin bidir copy operations + // @TODO: replace it with a signal pool call + err_ = hsa_signal_create(1, 0, NULL, &signal_rev); + ErrorCheck(err_); + err_ = hsa_signal_create(1, 0, NULL, &signal_start_bidir); ErrorCheck(err_); - } - - // Signal the bidir copies to begin - if (bidir) { - hsa_signal_store_relaxed(signal_start_bidir, 0); - } - - WaitForCopyCompletion(signal_list); - - // Stop the timer object and extract time taken - if (print_cpu_time_) { - cpu_end_ = std::chrono::steady_clock::now(); - cpu_cp_time_ = cpu_end_ - cpu_start_; - uint64_t cpu_temp = cpu_cp_time_.count(); - cpu_time.push_back(cpu_temp); - } - - // Collect time from the signal(s) - if (print_cpu_time_ == false) { - if (trans.copy.uses_gpu_) { - double temp = GetGpuCopyTime(bidir, signal_fwd, signal_rev); - gpu_time.push_back(temp); - } - } - if (validate_) { - verify = ValidateDstBuffer(max_size, curr_size, buf_dst_fwd, - dst_dev_idx_fwd, dst_agent_fwd); - } + signal_list.push_back(signal_rev); + signal_list.push_back(signal_start_bidir); + buffer_list.push_back(buf_src_rev); + buffer_list.push_back(buf_dst_rev); } - // Collecting Cpu time. Capture verify failures if any - // Get min and mean copy times and collect them into Cpu - // time list - double min_time = 0; - double mean_time = 0; - if (print_cpu_time_) { - min_time = (verify) ? GetMinTime(cpu_time) : VALIDATE_COPY_OP_FAILURE; - mean_time = (verify) ? GetMeanTime(cpu_time) : VALIDATE_COPY_OP_FAILURE; - trans.cpu_min_time_.push_back(min_time); - trans.cpu_avg_time_.push_back(mean_time); + // Initialize source buffers with data that could be verified + InitializeSrcBuffer(max_size, buf_src_fwd, src_dev_idx_fwd, src_agent_fwd); + if (bidir) { + InitializeSrcBuffer(max_size, buf_src_rev, src_dev_idx_rev, src_agent_rev); } - // Collecting Gpu time. Capture verify failures if any - // Get min and mean copy times and collect them into Gpu - // time list - if (print_cpu_time_ == false) { - if (trans.copy.uses_gpu_) { - min_time = (verify) ? GetMinTime(gpu_time) : VALIDATE_COPY_OP_FAILURE; - mean_time = (verify) ? GetMeanTime(gpu_time) : VALIDATE_COPY_OP_FAILURE; - trans.gpu_min_time_.push_back(min_time); - trans.gpu_avg_time_.push_back(mean_time); - } + // Setup access to destination buffers for + // both unidirectional and bidirectional copies + AcquirePoolAcceses(src_dev_idx_fwd, src_agent_fwd, buf_src_fwd, dst_dev_idx_fwd, dst_agent_fwd, + buf_dst_fwd); + if (bidir) { + AcquirePoolAcceses(src_dev_idx_rev, src_agent_rev, buf_src_rev, dst_dev_idx_rev, + dst_agent_rev, buf_dst_rev); } - verify = true; - // Clear the stack of cpu times - if (print_cpu_time_) { - cpu_time.clear(); + // Bind the number of iterations + uint32_t iterations = GetIterationNum(); + + // Iterate through the differnt buffer sizes to + // compute the bandwidth as determined by copy + for (uint32_t idx = 0; idx < size_len; idx++) { + // This should not be happening + size_t curr_size = size_list_[idx]; + if (curr_size > max_size) { + break; + } + + bool verify = true; + std::vector cpu_time; + std::vector gpu_time; + for (uint32_t it = 0; it < iterations; it++) { + if (it % 2) { + printf("."); + fflush(stdout); + } + + hsa_signal_store_relaxed(signal_fwd, 1); + if (bidir) { + hsa_signal_store_relaxed(signal_rev, 1); + hsa_signal_store_relaxed(signal_start_bidir, 1); + } + + // Temporary code for testing + if (sleep_time_ > 0) { + std::this_thread::sleep_for(sleep_usecs_); + } + + // Create a timer object and start it + if (print_cpu_time_) { + cpu_start_ = std::chrono::steady_clock::now(); + } + + // Launch the copy operation + if (bidir == false) { + err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, buf_src_fwd, + src_agent_fwd, curr_size, 0, NULL, signal_fwd); + } else { + err_ = hsa_amd_memory_async_copy(buf_dst_fwd, dst_agent_fwd, buf_src_fwd, + src_agent_fwd, curr_size, 1, &signal_start_bidir, + signal_fwd); + } + ErrorCheck(err_); + + // Launch reverse copy operation if it is bidirectional + if (bidir) { + err_ = hsa_amd_memory_async_copy(buf_dst_rev, dst_agent_rev, buf_src_rev, + src_agent_rev, curr_size, 1, &signal_start_bidir, + signal_rev); + ErrorCheck(err_); + } + + // Signal the bidir copies to begin + if (bidir) { + hsa_signal_store_relaxed(signal_start_bidir, 0); + } + + WaitForCopyCompletion(signal_list); + + // Stop the timer object and extract time taken + if (print_cpu_time_) { + cpu_end_ = std::chrono::steady_clock::now(); + cpu_cp_time_ = cpu_end_ - cpu_start_; + uint64_t cpu_temp = cpu_cp_time_.count(); + cpu_time.push_back(cpu_temp); + } + + // Collect time from the signal(s) + if (print_cpu_time_ == false) { + if (trans.copy.uses_gpu_) { + double temp = GetGpuCopyTime(bidir, signal_fwd, signal_rev); + gpu_time.push_back(temp); + } + } + + if (validate_) { + verify = ValidateDstBuffer(max_size, curr_size, buf_dst_fwd, dst_dev_idx_fwd, + dst_agent_fwd); + } + } + + // Collecting Cpu time. Capture verify failures if any + // Get min and mean copy times and collect them into Cpu + // time list + double min_time = 0; + double mean_time = 0; + if (print_cpu_time_) { + min_time = (verify) ? GetMinTime(cpu_time) : VALIDATE_COPY_OP_FAILURE; + mean_time = (verify) ? GetMeanTime(cpu_time) : VALIDATE_COPY_OP_FAILURE; + trans.cpu_min_time_.push_back(min_time); + trans.cpu_avg_time_.push_back(mean_time); + } + + // Collecting Gpu time. Capture verify failures if any + // Get min and mean copy times and collect them into Gpu + // time list + if (print_cpu_time_ == false) { + if (trans.copy.uses_gpu_) { + min_time = (verify) ? GetMinTime(gpu_time) : VALIDATE_COPY_OP_FAILURE; + mean_time = (verify) ? GetMeanTime(gpu_time) : VALIDATE_COPY_OP_FAILURE; + trans.gpu_min_time_.push_back(min_time); + trans.gpu_avg_time_.push_back(mean_time); + } + } + verify = true; + + // Clear the stack of cpu times + if (print_cpu_time_) { + cpu_time.clear(); + } + gpu_time.clear(); } - gpu_time.clear(); - } - // Free up buffers and signal objects used in copy operation - ReleaseSignals(signal_list); - ReleaseBuffers(buffer_list); + // Free up buffers and signal objects used in copy operation + ReleaseSignals(signal_list); + ReleaseBuffers(buffer_list); } void RocmBandwidthTest::Run() { - - // Enable profiling of Async Copy Activity - if (print_cpu_time_ == false) { - err_ = hsa_amd_profiling_async_copy_enable(true); - ErrorCheck(err_); - } - - if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || - (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { - bool bidir = (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR); - RunConcurrentCopyBenchmark(bidir, trans_list_); - ComputeCopyTime(trans_list_); - err_ = hsa_amd_profiling_async_copy_enable(false); - ErrorCheck(err_); - return; - } - - // Iterate through the list of transactions and execute them - uint32_t trans_size = trans_list_.size(); - for (uint32_t idx = 0; idx < trans_size; idx++) { - async_trans_t& trans = trans_list_[idx]; - if ((trans.req_type_ == REQ_COPY_BIDIR) || - (trans.req_type_ == REQ_COPY_UNIDIR) || - (trans.req_type_ == REQ_COPY_ALL_BIDIR) || - (trans.req_type_ == REQ_COPY_ALL_UNIDIR)) { - RunCopyBenchmark(trans); - ComputeCopyTime(trans); + // Enable profiling of Async Copy Activity + if (print_cpu_time_ == false) { + err_ = hsa_amd_profiling_async_copy_enable(true); + ErrorCheck(err_); } - if ((trans.req_type_ == REQ_READ) || - (trans.req_type_ == REQ_WRITE)) { - RunIOBenchmark(trans); + + if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || + (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { + bool bidir = (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR); + RunConcurrentCopyBenchmark(bidir, trans_list_); + ComputeCopyTime(trans_list_); + err_ = hsa_amd_profiling_async_copy_enable(false); + ErrorCheck(err_); + return; } - } - // Disable profiling of Async Copy Activity - if (print_cpu_time_ == false) { - err_ = hsa_amd_profiling_async_copy_enable(false); - ErrorCheck(err_); - } + // Iterate through the list of transactions and execute them + uint32_t trans_size = trans_list_.size(); + for (uint32_t idx = 0; idx < trans_size; idx++) { + async_trans_t& trans = trans_list_[idx]; + if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR) || + (trans.req_type_ == REQ_COPY_ALL_BIDIR) || (trans.req_type_ == REQ_COPY_ALL_UNIDIR)) { + RunCopyBenchmark(trans); + ComputeCopyTime(trans); + } + if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) { + RunIOBenchmark(trans); + } + } + // Disable profiling of Async Copy Activity + if (print_cpu_time_ == false) { + err_ = hsa_amd_profiling_async_copy_enable(false); + ErrorCheck(err_); + } } void RocmBandwidthTest::Close() { + if (init_src_ != NULL) { + hsa_signal_destroy(init_signal_); + hsa_amd_memory_pool_free(init_src_); + } - if (init_src_ != NULL) { - hsa_signal_destroy(init_signal_); - hsa_amd_memory_pool_free(init_src_); - } - - if (validate_) { - hsa_amd_memory_pool_free(validate_dst_); - } + if (validate_) { + hsa_amd_memory_pool_free(validate_dst_); + } - hsa_status_t status = hsa_shut_down(); - ErrorCheck(status); - return; + hsa_status_t status = hsa_shut_down(); + ErrorCheck(status); + return; } // Sets up the bandwidth test object to enable running // the various test scenarios requested by user. The // things this proceedure takes care of are: -// +// // Parse user arguments // Discover RocR Device Topology // Determine validity of requested test scenarios @@ -758,130 +696,120 @@ void RocmBandwidthTest::Close() { // Miscellaneous // void RocmBandwidthTest::SetUp() { + // Parse user arguments + ParseArguments(); + + // Validate input parameters + bool status = ValidateArguments(); + if (status == false) { + PrintHelpScreen(); + exit(1); + } - // Parse user arguments - ParseArguments(); - - // Validate input parameters - bool status = ValidateArguments(); - if (status == false) { - PrintHelpScreen(); - exit(1); - } - - // Build list of transactions (copy, read, write) to execute - status = BuildTransList(); - if (status == false) { - PrintHelpScreen(); - exit(1); - } + // Build list of transactions (copy, read, write) to execute + status = BuildTransList(); + if (status == false) { + PrintHelpScreen(); + exit(1); + } } RocmBandwidthTest::RocmBandwidthTest(int argc, char** argv) : BaseTest() { - - usr_argc_ = argc; - usr_argv_ = argv; - - pool_index_ = 0; - cpu_index_ = -1; - agent_index_ = 0; - - req_read_ = REQ_INVALID; - req_write_ = REQ_INVALID; - req_version_ = REQ_INVALID; - req_topology_ = REQ_INVALID; - req_copy_bidir_ = REQ_INVALID; - req_copy_unidir_ = REQ_INVALID; - req_copy_all_bidir_ = REQ_INVALID; - req_copy_all_unidir_ = REQ_INVALID; - req_concurrent_copy_bidir_ = REQ_INVALID; - req_concurrent_copy_unidir_ = REQ_INVALID; - - access_matrix_ = NULL; - link_hops_matrix_ = NULL; - link_type_matrix_ = NULL; - active_agents_list_ = NULL; - link_weight_matrix_ = NULL; - direct_access_matrix_ = NULL; - - init_ = false; - latency_ = false; - validate_ = false; - print_cpu_time_ = false; - - // Set initial value to 11.231926 in case - // user does not have a preference - init_val_ = 11.231926; - init_src_ = NULL; - validate_dst_ = NULL; - - // Initialize version of the test - version_.major_id = 2; - version_.minor_id = 6; - version_.step_id = 0; - version_.reserved = 0; - - // Test impact of sleep, temp code - sleep_time_ = 0; - bw_sleep_time_ = getenv("ROCM_BW_SLEEP_TIME"); - if (bw_sleep_time_ != NULL) { - sleep_time_ = atoi(bw_sleep_time_); - if ((sleep_time_ < 0) || (sleep_time_ > 400000)) { - std::cout << "Unit of sleep time is defined as 10 microseconds" << std::endl; - std::cout << "An input value of 10 implies sleep time of 100 microseconds" << std::endl; - std::cout << "Value of ROCM_BW_SLEEP_TIME must be between [1, 400000]" << sleep_time_ << std::endl; - exit(1); + usr_argc_ = argc; + usr_argv_ = argv; + + pool_index_ = 0; + cpu_index_ = -1; + agent_index_ = 0; + + req_read_ = REQ_INVALID; + req_write_ = REQ_INVALID; + req_version_ = REQ_INVALID; + req_topology_ = REQ_INVALID; + req_copy_bidir_ = REQ_INVALID; + req_copy_unidir_ = REQ_INVALID; + req_copy_all_bidir_ = REQ_INVALID; + req_copy_all_unidir_ = REQ_INVALID; + req_concurrent_copy_bidir_ = REQ_INVALID; + req_concurrent_copy_unidir_ = REQ_INVALID; + + access_matrix_ = NULL; + link_hops_matrix_ = NULL; + link_type_matrix_ = NULL; + active_agents_list_ = NULL; + link_weight_matrix_ = NULL; + direct_access_matrix_ = NULL; + + init_ = false; + latency_ = false; + validate_ = false; + print_cpu_time_ = false; + + // Set initial value to 11.231926 in case + // user does not have a preference + init_val_ = 11.231926; + init_src_ = NULL; + validate_dst_ = NULL; + + // Initialize version of the test + version_.major_id = 2; + version_.minor_id = 6; + version_.step_id = 0; + version_.reserved = 0; + + // Test impact of sleep, temp code + sleep_time_ = 0; + bw_sleep_time_ = getenv("ROCM_BW_SLEEP_TIME"); + if (bw_sleep_time_ != NULL) { + sleep_time_ = atoi(bw_sleep_time_); + if ((sleep_time_ < 0) || (sleep_time_ > 400000)) { + std::cout << "Unit of sleep time is defined as 10 microseconds" << std::endl; + std::cout << "An input value of 10 implies sleep time of 100 microseconds" << std::endl; + std::cout << "Value of ROCM_BW_SLEEP_TIME must be between [1, 400000]" << sleep_time_ + << std::endl; + exit(1); + } + sleep_time_ *= 10; + std::chrono::microseconds temp(sleep_time_); + sleep_usecs_ = temp; } - sleep_time_ *= 10; - std::chrono::microseconds temp(sleep_time_); - sleep_usecs_ = temp; - } - - bw_iter_cnt_ = getenv("ROCM_BW_ITER_CNT"); - bw_default_run_ = getenv("ROCM_BW_DEFAULT_RUN"); - bw_blocking_run_ = getenv("ROCR_BW_RUN_BLOCKING"); - skip_cpu_fine_grain_ = getenv("ROCM_SKIP_CPU_FINE_GRAINED_POOL"); - skip_gpu_coarse_grain_ = getenv("ROCM_SKIP_GPU_COARSE_GRAINED_POOL"); - - if (bw_iter_cnt_ != NULL) { - int32_t num = atoi(bw_iter_cnt_); - if (num < 0) { - std::cout << "Value of ROCM_BW_ITER_CNT can't be negative: " << num << std::endl; - exit(1); + + bw_iter_cnt_ = getenv("ROCM_BW_ITER_CNT"); + bw_default_run_ = getenv("ROCM_BW_DEFAULT_RUN"); + bw_blocking_run_ = getenv("ROCR_BW_RUN_BLOCKING"); + skip_cpu_fine_grain_ = getenv("ROCM_SKIP_CPU_FINE_GRAINED_POOL"); + skip_gpu_coarse_grain_ = getenv("ROCM_SKIP_GPU_COARSE_GRAINED_POOL"); + + if (bw_iter_cnt_ != NULL) { + int32_t num = atoi(bw_iter_cnt_); + if (num < 0) { + std::cout << "Value of ROCM_BW_ITER_CNT can't be negative: " << num << std::endl; + exit(1); + } + set_num_iteration(num); } - set_num_iteration(num); - } - exit_value_ = 0; + exit_value_ = 0; } RocmBandwidthTest::~RocmBandwidthTest() { + if (access_matrix_) delete[] access_matrix_; - if (access_matrix_) - delete[] access_matrix_; - - if (direct_access_matrix_) - delete[] direct_access_matrix_; - - if (link_hops_matrix_) - delete[] link_hops_matrix_; - - if (link_type_matrix_) - delete[] link_type_matrix_; - - if (link_weight_matrix_) - delete[] link_weight_matrix_; - - if (active_agents_list_) - delete[] active_agents_list_; -} + if (direct_access_matrix_) delete[] direct_access_matrix_; -std::string RocmBandwidthTest::GetVersion() const { + if (link_hops_matrix_) delete[] link_hops_matrix_; - std::stringstream stream; - stream << version_.major_id << "."; - stream << version_.minor_id << "."; - stream << version_.step_id; - return stream.str(); + if (link_type_matrix_) delete[] link_type_matrix_; + + if (link_weight_matrix_) delete[] link_weight_matrix_; + + if (active_agents_list_) delete[] active_agents_list_; } +std::string RocmBandwidthTest::GetVersion() const { + std::stringstream stream; + stream << version_.major_id << "."; + stream << version_.minor_id << "."; + stream << version_.step_id; + return stream.str(); +} diff --git a/rocm_bandwidth_test.hpp b/rocm_bandwidth_test.hpp old mode 100755 new mode 100644 index f7eb338..b79fafc --- a/rocm_bandwidth_test.hpp +++ b/rocm_bandwidth_test.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,7 +43,7 @@ #ifndef __ROC_BANDWIDTH_TEST_H__ #define __ROC_BANDWIDTH_TEST_H__ -#if(defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) +#if (defined(RBT_HSA_VERSION_FLAT) && ((RBT_HSA_VERSION_FLAT) < RBT_HSA_VERSION_FILEREORG)) // Hsa package with out file reorganization // This is for backward compatibility and will be deprecated from future release #include "hsa.h" @@ -54,508 +54,483 @@ #include "base_test.hpp" #include "common.hpp" -#include #include +#include using namespace std; // Structure to encapsulate a RocR agent and its index in a list typedef struct agent_info { + agent_info(hsa_agent_t agent, uint32_t index, hsa_device_type_t device_type) { + agent_ = agent; + index_ = index; + device_type_ = device_type; + } + + agent_info() {} - agent_info(hsa_agent_t agent, - uint32_t index, hsa_device_type_t device_type) { - agent_ = agent; - index_ = index; - device_type_ = device_type; - } - - agent_info() {} - - uint32_t index_; - hsa_agent_t agent_; - hsa_device_type_t device_type_; - char name_[64]; // Size specified in public header file - char uuid_[24]; // Unique ID of the device - char bdf_id_[16]; // Bus (8-bits), Device (5-bits), Function (3-bits) + uint32_t index_; + hsa_agent_t agent_; + hsa_device_type_t device_type_; + char name_[64]; // Size specified in public header file + char uuid_[24]; // Unique ID of the device + char bdf_id_[16]; // Bus (8-bits), Device (5-bits), Function (3-bits) } agent_info_t; typedef struct pool_info { - - pool_info(hsa_agent_t agent, uint32_t agent_index, - hsa_amd_memory_pool_t pool, hsa_amd_segment_t segment, - size_t size, uint32_t index, bool is_fine_grained, - bool is_kernarg, bool access_to_all, - hsa_amd_memory_pool_access_t owner_access) { - - pool_ = pool; - index_ = index; - segment_ = segment; - owner_agent_ = agent; - agent_index_ = agent_index; - allocable_size_ = size; - is_kernarg_ = is_kernarg; - owner_access_ = owner_access; - access_to_all_ = access_to_all; - is_fine_grained_ = is_fine_grained; - } - - pool_info() {} - - uint32_t index_; - bool is_kernarg_; - bool access_to_all_; - bool is_fine_grained_; - size_t allocable_size_; - uint32_t agent_index_; - hsa_agent_t owner_agent_; - hsa_amd_segment_t segment_; - hsa_amd_memory_pool_t pool_; - hsa_amd_memory_pool_access_t owner_access_; + pool_info(hsa_agent_t agent, uint32_t agent_index, hsa_amd_memory_pool_t pool, + hsa_amd_segment_t segment, size_t size, uint32_t index, bool is_fine_grained, + bool is_kernarg, bool access_to_all, hsa_amd_memory_pool_access_t owner_access) { + pool_ = pool; + index_ = index; + segment_ = segment; + owner_agent_ = agent; + agent_index_ = agent_index; + allocable_size_ = size; + is_kernarg_ = is_kernarg; + owner_access_ = owner_access; + access_to_all_ = access_to_all; + is_fine_grained_ = is_fine_grained; + } + + pool_info() {} + + uint32_t index_; + bool is_kernarg_; + bool access_to_all_; + bool is_fine_grained_; + size_t allocable_size_; + uint32_t agent_index_; + hsa_agent_t owner_agent_; + hsa_amd_segment_t segment_; + hsa_amd_memory_pool_t pool_; + hsa_amd_memory_pool_access_t owner_access_; } pool_info_t; // Used to print out topology info typedef struct agent_pool_info { + agent_pool_info() {} - agent_pool_info() {} - - agent_info agent; - - vector pool_list; + agent_info agent; + + vector pool_list; } agent_pool_info_t; typedef struct async_trans { - - uint32_t req_type_; - union { - struct { - bool bidir_; - bool uses_gpu_; - uint32_t src_idx_; - uint32_t dst_idx_; - hsa_amd_memory_pool_t src_pool_; - hsa_amd_memory_pool_t dst_pool_; - } copy; - struct { - void* code_; - uint32_t agent_idx_; - hsa_agent_t agent_; - uint32_t pool_idx_; - hsa_amd_memory_pool_t pool_; - } kernel; - }; - - // Cpu BenchMark average copy time - vector cpu_avg_time_; - - // Cpu Min time - vector cpu_min_time_; - - // Gpu BenchMark average copy time - vector gpu_avg_time_; - - // Gpu Min time - vector gpu_min_time_; - - // BenchMark's Average copy time and average bandwidth - vector avg_time_; - vector avg_bandwidth_; - - // BenchMark's Min copy time and peak bandwidth - vector min_time_; - vector peak_bandwidth_; - - async_trans(uint32_t req_type) { req_type_ = req_type; } + uint32_t req_type_; + union { + struct { + bool bidir_; + bool uses_gpu_; + uint32_t src_idx_; + uint32_t dst_idx_; + hsa_amd_memory_pool_t src_pool_; + hsa_amd_memory_pool_t dst_pool_; + } copy; + struct { + void* code_; + uint32_t agent_idx_; + hsa_agent_t agent_; + uint32_t pool_idx_; + hsa_amd_memory_pool_t pool_; + } kernel; + }; + + // Cpu BenchMark average copy time + vector cpu_avg_time_; + + // Cpu Min time + vector cpu_min_time_; + + // Gpu BenchMark average copy time + vector gpu_avg_time_; + + // Gpu Min time + vector gpu_min_time_; + + // BenchMark's Average copy time and average bandwidth + vector avg_time_; + vector avg_bandwidth_; + + // BenchMark's Min copy time and peak bandwidth + vector min_time_; + vector peak_bandwidth_; + + async_trans(uint32_t req_type) { req_type_ = req_type; } } async_trans_t; typedef enum Request_Type { - REQ_READ = 1, - REQ_WRITE = 2, - REQ_VERSION = 3, - REQ_TOPOLOGY = 4, - REQ_LIST_DEVS = 5, - REQ_COPY_BIDIR = 6, - REQ_COPY_UNIDIR = 7, - REQ_COPY_ALL_BIDIR = 8, - REQ_COPY_ALL_UNIDIR = 9, - REQ_CONCURRENT_COPY_BIDIR = 10, - REQ_CONCURRENT_COPY_UNIDIR = 11, - REQ_INVALID = 12, + REQ_READ = 1, + REQ_WRITE = 2, + REQ_VERSION = 3, + REQ_TOPOLOGY = 4, + REQ_LIST_DEVS = 5, + REQ_COPY_BIDIR = 6, + REQ_COPY_UNIDIR = 7, + REQ_COPY_ALL_BIDIR = 8, + REQ_COPY_ALL_UNIDIR = 9, + REQ_CONCURRENT_COPY_BIDIR = 10, + REQ_CONCURRENT_COPY_UNIDIR = 11, + REQ_INVALID = 12, } Request_Type; class RocmBandwidthTest : public BaseTest { + public: + // @brief: Constructor for test case of RocmBandwidthTest + RocmBandwidthTest(int argc, char** argv); + + // @brief: Destructor for test case of RocmBandwidthTest + virtual ~RocmBandwidthTest(); + + // @brief: Setup the environment for measurement + virtual void SetUp(); + + // @brief: Core measurement execution + virtual void Run(); + + // @brief: Clean up and retrive the resource + virtual void Close(); + + // @brief: Display the results + virtual void Display() const; + + // @brief: Return exit value, useful in case of error + int32_t GetExitValue() { return exit_value_; } + + private: + // @brief: Print Help Menu Screen + void PrintHelpScreen(); + + // @brief: Discover the topology of pools on Rocm Platform + void DiscoverTopology(); + + // @brief: Populate link properties for the set of agents + void DiscoverLinkProps(); + void BindLinkProps(uint32_t idx1, uint32_t idx2); + + // @brief: Populates the access matrix + void PopulateAccessMatrix(); + + // @brief: Print topology info + void PrintTopology(); + + // @brief: Print in matrix form various + // properties such as access, link weight, + // link type and number of hops, etc + void PrintLinkPropsMatrix(uint32_t key) const; + + // @brief: Print info on agents in system + void PrintAgentsList(); + + // @brief: Print info on memory pools in system + void PrintPoolsList(); + + // @brief: Parse the arguments provided by user to + // build list of transactions + void ParseArguments(); + + // @brief Validate user input of primary operations + void ValidateInputFlags(uint32_t pf_cnt, uint32_t copy_mask, uint32_t copy_ctrl_mask); + + // @brief: Print the list of transactions + void PrintTransList(); + + // @brief: Run read/write requests of users + void RunIOBenchmark(async_trans_t& trans); - public: + // @brief: Run copy requests of users + void RunCopyBenchmark(async_trans_t& trans); - // @brief: Constructor for test case of RocmBandwidthTest - RocmBandwidthTest(int argc, char** argv); + // @brief: Run copy requests of users + void RunConcurrentCopyBenchmark(bool bidir, vector& trans_list); - // @brief: Destructor for test case of RocmBandwidthTest - virtual ~RocmBandwidthTest(); + // @brief: Get iteration number + uint32_t GetIterationNum(); + + // @brief: Get the mean copy time + double GetMeanTime(vector& vec); - // @brief: Setup the environment for measurement - virtual void SetUp(); - - // @brief: Core measurement execution - virtual void Run(); + // @brief: Get the min copy time + double GetMinTime(vector& vec); - // @brief: Clean up and retrive the resource - virtual void Close(); - - // @brief: Display the results - virtual void Display() const; - - // @brief: Return exit value, useful in case of error - int32_t GetExitValue() { return exit_value_; } - - private: - - // @brief: Print Help Menu Screen - void PrintHelpScreen(); - - // @brief: Discover the topology of pools on Rocm Platform - void DiscoverTopology(); - - // @brief: Populate link properties for the set of agents - void DiscoverLinkProps(); - void BindLinkProps(uint32_t idx1, uint32_t idx2); - - // @brief: Populates the access matrix - void PopulateAccessMatrix(); - - // @brief: Print topology info - void PrintTopology(); - - // @brief: Print in matrix form various - // properties such as access, link weight, - // link type and number of hops, etc - void PrintLinkPropsMatrix(uint32_t key) const; - - // @brief: Print info on agents in system - void PrintAgentsList(); - - // @brief: Print info on memory pools in system - void PrintPoolsList(); - - // @brief: Parse the arguments provided by user to - // build list of transactions - void ParseArguments(); - - // @brief Validate user input of primary operations - void ValidateInputFlags(uint32_t pf_cnt, - uint32_t copy_mask, uint32_t copy_ctrl_mask); - - // @brief: Print the list of transactions - void PrintTransList(); - - // @brief: Run read/write requests of users - void RunIOBenchmark(async_trans_t& trans); - - // @brief: Run copy requests of users - void RunCopyBenchmark(async_trans_t& trans); - - // @brief: Run copy requests of users - void RunConcurrentCopyBenchmark(bool bidir, - vector& trans_list); - - // @brief: Get iteration number - uint32_t GetIterationNum(); - - // @brief: Get the mean copy time - double GetMeanTime(vector& vec); - - // @brief: Get the min copy time - double GetMinTime(vector& vec); - - // @brief: Dispaly Benchmark result - void PopulatePerfMatrix(bool peak, double* perf_matrix) const; - void PrintPerfMatrix(bool validate, bool peak, double* perf_matrix) const; - void DisplayDevInfo() const; - void DisplayIOTime(async_trans_t& trans) const; - void DisplayCopyTime(async_trans_t& trans) const; - void DisplayCopyTimeMatrix(bool peak) const; - void DisplayValidationMatrix() const; - - private: - - // @brief: Validate the arguments passed in by user - bool ValidateArguments(); - bool ValidateReadReq(); - bool ValidateWriteReq(); - bool ValidateReadOrWriteReq(vector& in_list); - - void ValidateCopyBidirFlags(uint32_t copy_ctrl_mask); - void ValidateCopyAllBidirFlags(uint32_t copy_ctrl_mask); - void ValidateCopyAllUnidirFlags(uint32_t copy_ctrl_mask); - void ValidateCopyUnidirFlags(uint32_t copy_mask, uint32_t copy_ctrl_mask); - - bool ValidateBidirCopyReq(); - bool ValidateUnidirCopyReq(); - bool ValidateConcurrentCopyReq(); - bool ValidateCopyReq(vector& in_list); - void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx); - void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx); - - bool PoolIsPresent(vector& in_list); - bool PoolIsDuplicated(vector& in_list); - - // @brief: Builds a list of transaction per user request - void ComputeCopyTime(async_trans_t& trans); - void ComputeCopyTime(vector& trans_list); - void BuildDeviceList(); - void BuildBufferList(); - bool BuildTransList(); - bool BuildReadTrans(); - bool BuildWriteTrans(); - bool BuildBidirCopyTrans(); - bool BuildUnidirCopyTrans(); - bool BuildAllPoolsBidirCopyTrans(); - bool BuildAllPoolsUnidirCopyTrans(); - bool BuildReadOrWriteTrans(uint32_t req_type, - vector& in_list); - bool BuildCopyTrans(uint32_t req_type, - vector& src_list, - vector& dst_list); - bool BuildConcurrentCopyTrans(uint32_t req_type, - vector& dev_list); - - void WaitForCopyCompletion(vector& signal_list); - - void AllocateCopyBuffers(size_t size, - void*& src, hsa_amd_memory_pool_t src_pool, - void*& dst, hsa_amd_memory_pool_t dst_pool); - - void AllocateConcurrentCopyResources(bool bidir, - vector& trans_list, - vector& buffer_list, - vector& dev_list, - vector& dev_idx_list, - vector& sig_list, - vector& pool_list); - - void ReleaseBuffers(vector& buffer_list); - void ReleaseSignals(vector& signal_list); - - double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev); - - void InitializeSrcBuffer(size_t size, void* buf_cpy, - uint32_t cpy_dev_idx, hsa_agent_t cpy_agent); - - bool ValidateDstBuffer(size_t max_size, size_t curr_size, - void* buf_cpy, uint32_t cpy_dev_idx, hsa_agent_t cpy_agent); - - void copy_buffer(void* dst, hsa_agent_t dst_agent, - void* src, hsa_agent_t src_agent, - size_t size, hsa_signal_t signal); - bool FilterCpuPool(uint32_t req_type, - hsa_device_type_t dev_type, - bool fine_grained); - - // Find the mirror transaction if present - bool FindMirrorRequest(bool reverse, uint32_t src_idx, uint32_t dst_idx); - - // @brief: Check if agent and access memory pool, if so, set - // access to the agent, if not, exit - void AcquireAccess(hsa_agent_t agent, void* ptr); - void AcquirePoolAcceses(uint32_t src_dev_idx, hsa_agent_t src_agent, void* src, - uint32_t dst_dev_idx, hsa_agent_t dst_agent, void* dst); - - // Functions to find agents and memory pools and udpate - // relevant data structures used to maintain system topology - friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data); - friend hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data); - - // Populate the Bus Device Function of Gpu device - friend void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info); - - // Compute the type and weight of a link - friend uint32_t GetLinkType(hsa_device_type_t src_dev_type, - hsa_device_type_t dst_dev_type, - hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops); - friend uint32_t GetLinkWeight(hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops); - - // Return value of input key as string - friend std::string GetValueAsString(uint32_t key, uint32_t value); - - // Structure of Version used to identify an instance of RocmBandwidthTest - struct RocmBandwidthVersion { - - // Tracks changes such as re-design - // re-factor, re-write, extend with - // new major functionality, etc - uint32_t major_id; - - // Tracks changes that affect Apis - // being added, removed, modified - uint32_t minor_id; - - // Tracks changes that affect Apis - // being added, removed, modified - uint32_t step_id; - - // Used to pack space for structure alignment - uint32_t reserved; - }; - - RocmBandwidthVersion version_; - void PrintVersion() const; - void PrintLaunchCmd() const; - std::string GetVersion() const; - - // Used to help count agent_info - uint32_t agent_index_; - - // List used to store agent info, indexed by agent_index_ - vector agent_list_; - - // Used to help count pool_info_t - uint32_t pool_index_; - - // List used to store pool_info_t, indexed by pool_index_ - vector pool_list_; - - // List used to store agent_pool_info_t - vector agent_pool_list_; - - // List of agents involved in a bidrectional copy operation - // Size of the list cannot exceed the number of agents - // reported by the system - vector bidir_list_; - - // List of source agents in a unidrectional copy operation - // Size of the list cannot exceed the number of agents - // reported by the system - vector src_list_; - - // List of destination agents in a unidrectional copy operation - // Size of the list cannot exceed the number of agents - // reported by the system - vector dst_list_; - - // List of agents involved in read operation. Has - // two agents, the first agent hosts the memory pool - // while the second agent executes the read operation - vector read_list_; - - // List of agents involved in write operation. Has - // two agents, the first agent hosts the memory pool - // while the second agent executes the write operation - vector write_list_; - - // List of sizes to use in copy and read/write transactions - // Size is specified in terms of Megabytes - vector size_list_; - - // Type of service requested by user - uint32_t req_read_; - uint32_t req_write_; - uint32_t req_version_; - uint32_t req_topology_; - uint32_t req_list_devs_; - uint32_t req_copy_bidir_; - uint32_t req_copy_unidir_; - uint32_t req_copy_all_bidir_; - uint32_t req_copy_all_unidir_; - uint32_t req_concurrent_copy_bidir_; - uint32_t req_concurrent_copy_unidir_; - - static const uint32_t USR_SRC_FLAG = 0x01; - static const uint32_t USR_DST_FLAG = 0x02; - - static const uint32_t USR_BUFFER_SIZE = 0x01; - static const uint32_t USR_BUFFER_INIT = 0x02; - static const uint32_t CPU_VISIBLE_TIME = 0x04; - static const uint32_t DEV_COPY_LATENCY = 0x08; - static const uint32_t VALIDATE_COPY_OP = 0x010; - - static const uint32_t LINK_TYPE_SELF = 0x00; - static const uint32_t LINK_TYPE_PCIE = 0x01; - static const uint32_t LINK_TYPE_XGMI = 0x02; - static const uint32_t LINK_TYPE_IGNORED = 0x03; - static const uint32_t LINK_TYPE_NO_PATH = 0xFFFFFFFF; - - static const uint32_t LINK_PROP_HOPS = 0x00; - static const uint32_t LINK_PROP_TYPE = 0x01; - static const uint32_t LINK_PROP_WEIGHT = 0x02; - static const uint32_t LINK_PROP_ACCESS = 0x03; - - // Encodes validation failure - static const double VALIDATE_COPY_OP_FAILURE; - - // List used to store transactions per user request - vector trans_list_; - - // List used to track agents involved in various transactions - uint32_t* active_agents_list_; - - // Matrix used to track Access among agents - uint32_t* access_matrix_; - uint32_t* link_hops_matrix_; - uint32_t* link_type_matrix_; - uint32_t* link_weight_matrix_; - uint32_t* direct_access_matrix_; - - // Env key to determine if Fine-grained or - // Coarse-grained pool should be filtered out - char* skip_cpu_fine_grain_; - char* skip_gpu_coarse_grain_; - - // Env key to determine if the run should block - // or actively wait on completion signal - char* bw_blocking_run_; - - // Env key to determine if the run is a default one - char* bw_default_run_; - - // Env key to specify iteration count - char* bw_iter_cnt_; - char* bw_sleep_time_; - uint32_t sleep_time_; - std::chrono::nanoseconds cpu_cp_time_; - std::chrono::microseconds sleep_usecs_; - std::chrono::time_point cpu_end_; - std::chrono::time_point cpu_start_; - - // Variable to store argument number - uint32_t usr_argc_; - - // Pointer to store address of argument text - char** usr_argv_; - - // Flag to print Cpu time - bool print_cpu_time_; - - // Determines if user has requested initialization - bool init_; - - // Determines if user has requested validation - bool validate_; - long double init_val_; - - // Handles to buffer used to initialize and validate - void* init_src_; - void* validate_dst_; - hsa_signal_t init_signal_; - - // Determines the latency overhead of copy operations - bool latency_; - - // CPU agent used for validation - int32_t cpu_index_; - hsa_agent_t cpu_agent_; - - // System region - hsa_amd_memory_pool_t sys_pool_; - - static const size_t SIZE_LIST[20]; - static const size_t LATENCY_SIZE_LIST[20]; - - // Exit value to return in case of error - int32_t exit_value_; + // @brief: Dispaly Benchmark result + void PopulatePerfMatrix(bool peak, double* perf_matrix) const; + void PrintPerfMatrix(bool validate, bool peak, double* perf_matrix) const; + void DisplayDevInfo() const; + void DisplayIOTime(async_trans_t& trans) const; + void DisplayCopyTime(async_trans_t& trans) const; + void DisplayCopyTimeMatrix(bool peak) const; + void DisplayValidationMatrix() const; + + private: + // @brief: Validate the arguments passed in by user + bool ValidateArguments(); + bool ValidateReadReq(); + bool ValidateWriteReq(); + bool ValidateReadOrWriteReq(vector& in_list); + + void ValidateCopyBidirFlags(uint32_t copy_ctrl_mask); + void ValidateCopyAllBidirFlags(uint32_t copy_ctrl_mask); + void ValidateCopyAllUnidirFlags(uint32_t copy_ctrl_mask); + void ValidateCopyUnidirFlags(uint32_t copy_mask, uint32_t copy_ctrl_mask); + + bool ValidateBidirCopyReq(); + bool ValidateUnidirCopyReq(); + bool ValidateConcurrentCopyReq(); + bool ValidateCopyReq(vector& in_list); + void PrintIOAccessError(uint32_t agent_idx, uint32_t pool_idx); + void PrintCopyAccessError(uint32_t src_pool_idx, uint32_t dst_pool_idx); + + bool PoolIsPresent(vector& in_list); + bool PoolIsDuplicated(vector& in_list); + + // @brief: Builds a list of transaction per user request + void ComputeCopyTime(async_trans_t& trans); + void ComputeCopyTime(vector& trans_list); + void BuildDeviceList(); + void BuildBufferList(); + bool BuildTransList(); + bool BuildReadTrans(); + bool BuildWriteTrans(); + bool BuildBidirCopyTrans(); + bool BuildUnidirCopyTrans(); + bool BuildAllPoolsBidirCopyTrans(); + bool BuildAllPoolsUnidirCopyTrans(); + bool BuildReadOrWriteTrans(uint32_t req_type, vector& in_list); + bool BuildCopyTrans(uint32_t req_type, vector& src_list, vector& dst_list); + bool BuildConcurrentCopyTrans(uint32_t req_type, vector& dev_list); + + void WaitForCopyCompletion(vector& signal_list); + + void AllocateCopyBuffers(size_t size, void*& src, hsa_amd_memory_pool_t src_pool, + void*& dst, hsa_amd_memory_pool_t dst_pool); + + void AllocateConcurrentCopyResources(bool bidir, vector& trans_list, + vector& buffer_list, + vector& dev_list, + vector& dev_idx_list, + vector& sig_list, + vector& pool_list); + + void ReleaseBuffers(vector& buffer_list); + void ReleaseSignals(vector& signal_list); + + double GetGpuCopyTime(bool bidir, hsa_signal_t signal_fwd, hsa_signal_t signal_rev); + + void InitializeSrcBuffer(size_t size, void* buf_cpy, uint32_t cpy_dev_idx, + hsa_agent_t cpy_agent); + + bool ValidateDstBuffer(size_t max_size, size_t curr_size, void* buf_cpy, + uint32_t cpy_dev_idx, hsa_agent_t cpy_agent); + + void copy_buffer(void* dst, hsa_agent_t dst_agent, void* src, hsa_agent_t src_agent, + size_t size, hsa_signal_t signal); + bool FilterCpuPool(uint32_t req_type, hsa_device_type_t dev_type, bool fine_grained); + + // Find the mirror transaction if present + bool FindMirrorRequest(bool reverse, uint32_t src_idx, uint32_t dst_idx); + + // @brief: Check if agent and access memory pool, if so, set + // access to the agent, if not, exit + void AcquireAccess(hsa_agent_t agent, void* ptr); + void AcquirePoolAcceses(uint32_t src_dev_idx, hsa_agent_t src_agent, void* src, + uint32_t dst_dev_idx, hsa_agent_t dst_agent, void* dst); + + // Functions to find agents and memory pools and udpate + // relevant data structures used to maintain system topology + friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data); + friend hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data); + + // Populate the Bus Device Function of Gpu device + friend void PopulateBDF(uint32_t bdf_id, agent_info_t* agent_info); + + // Compute the type and weight of a link + friend uint32_t GetLinkType(hsa_device_type_t src_dev_type, hsa_device_type_t dst_dev_type, + hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops); + friend uint32_t GetLinkWeight(hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops); + + // Return value of input key as string + friend std::string GetValueAsString(uint32_t key, uint32_t value); + + // Structure of Version used to identify an instance of RocmBandwidthTest + struct RocmBandwidthVersion { + // Tracks changes such as re-design + // re-factor, re-write, extend with + // new major functionality, etc + uint32_t major_id; + + // Tracks changes that affect Apis + // being added, removed, modified + uint32_t minor_id; + + // Tracks changes that affect Apis + // being added, removed, modified + uint32_t step_id; + + // Used to pack space for structure alignment + uint32_t reserved; + }; + + RocmBandwidthVersion version_; + void PrintVersion() const; + void PrintLaunchCmd() const; + std::string GetVersion() const; + + // Used to help count agent_info + uint32_t agent_index_; + + // List used to store agent info, indexed by agent_index_ + vector agent_list_; + + // Used to help count pool_info_t + uint32_t pool_index_; + + // List used to store pool_info_t, indexed by pool_index_ + vector pool_list_; + + // List used to store agent_pool_info_t + vector agent_pool_list_; + + // List of agents involved in a bidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector bidir_list_; + + // List of source agents in a unidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector src_list_; + + // List of destination agents in a unidrectional copy operation + // Size of the list cannot exceed the number of agents + // reported by the system + vector dst_list_; + + // List of agents involved in read operation. Has + // two agents, the first agent hosts the memory pool + // while the second agent executes the read operation + vector read_list_; + + // List of agents involved in write operation. Has + // two agents, the first agent hosts the memory pool + // while the second agent executes the write operation + vector write_list_; + + // List of sizes to use in copy and read/write transactions + // Size is specified in terms of Megabytes + vector size_list_; + + // Type of service requested by user + uint32_t req_read_; + uint32_t req_write_; + uint32_t req_version_; + uint32_t req_topology_; + uint32_t req_list_devs_; + uint32_t req_copy_bidir_; + uint32_t req_copy_unidir_; + uint32_t req_copy_all_bidir_; + uint32_t req_copy_all_unidir_; + uint32_t req_concurrent_copy_bidir_; + uint32_t req_concurrent_copy_unidir_; + + static const uint32_t USR_SRC_FLAG = 0x01; + static const uint32_t USR_DST_FLAG = 0x02; + + static const uint32_t USR_BUFFER_SIZE = 0x01; + static const uint32_t USR_BUFFER_INIT = 0x02; + static const uint32_t CPU_VISIBLE_TIME = 0x04; + static const uint32_t DEV_COPY_LATENCY = 0x08; + static const uint32_t VALIDATE_COPY_OP = 0x010; + + static const uint32_t LINK_TYPE_SELF = 0x00; + static const uint32_t LINK_TYPE_PCIE = 0x01; + static const uint32_t LINK_TYPE_XGMI = 0x02; + static const uint32_t LINK_TYPE_IGNORED = 0x03; + static const uint32_t LINK_TYPE_NO_PATH = 0xFFFFFFFF; + + static const uint32_t LINK_PROP_HOPS = 0x00; + static const uint32_t LINK_PROP_TYPE = 0x01; + static const uint32_t LINK_PROP_WEIGHT = 0x02; + static const uint32_t LINK_PROP_ACCESS = 0x03; + + // Encodes validation failure + static const double VALIDATE_COPY_OP_FAILURE; + + // List used to store transactions per user request + vector trans_list_; + + // List used to track agents involved in various transactions + uint32_t* active_agents_list_; + + // Matrix used to track Access among agents + uint32_t* access_matrix_; + uint32_t* link_hops_matrix_; + uint32_t* link_type_matrix_; + uint32_t* link_weight_matrix_; + uint32_t* direct_access_matrix_; + + // Env key to determine if Fine-grained or + // Coarse-grained pool should be filtered out + char* skip_cpu_fine_grain_; + char* skip_gpu_coarse_grain_; + + // Env key to determine if the run should block + // or actively wait on completion signal + char* bw_blocking_run_; + + // Env key to determine if the run is a default one + char* bw_default_run_; + + // Env key to specify iteration count + char* bw_iter_cnt_; + char* bw_sleep_time_; + uint32_t sleep_time_; + std::chrono::nanoseconds cpu_cp_time_; + std::chrono::microseconds sleep_usecs_; + std::chrono::time_point cpu_end_; + std::chrono::time_point cpu_start_; + + // Variable to store argument number + uint32_t usr_argc_; + + // Pointer to store address of argument text + char** usr_argv_; + + // Flag to print Cpu time + bool print_cpu_time_; + + // Determines if user has requested initialization + bool init_; + + // Determines if user has requested validation + bool validate_; + long double init_val_; + + // Handles to buffer used to initialize and validate + void* init_src_; + void* validate_dst_; + hsa_signal_t init_signal_; + + // Determines the latency overhead of copy operations + bool latency_; + + // CPU agent used for validation + int32_t cpu_index_; + hsa_agent_t cpu_agent_; + + // System region + hsa_amd_memory_pool_t sys_pool_; + + static const size_t SIZE_LIST[20]; + static const size_t LATENCY_SIZE_LIST[20]; + + // Exit value to return in case of error + int32_t exit_value_; }; -#endif // __ROC_BANDWIDTH_TEST_H__ +#endif // __ROC_BANDWIDTH_TEST_H__ diff --git a/rocm_bandwidth_test_io.cpp b/rocm_bandwidth_test_io.cpp old mode 100755 new mode 100644 index 42dccee..7b3e9b5 --- a/rocm_bandwidth_test_io.cpp +++ b/rocm_bandwidth_test_io.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,15 +43,15 @@ #include "common.hpp" #include "rocm_bandwidth_test.hpp" -#include #include -#include +#include #include + +#include #include #include void RocmBandwidthTest::RunIOBenchmark(async_trans_t& trans) { - - std::cout << "Unsupported Request - Read / Write" << std::endl; - exit(1); + std::cout << "Unsupported Request - Read / Write" << std::endl; + exit(1); } diff --git a/rocm_bandwidth_test_parse.cpp b/rocm_bandwidth_test_parse.cpp old mode 100755 new mode 100644 index bd43533..601d84f --- a/rocm_bandwidth_test_parse.cpp +++ b/rocm_bandwidth_test_parse.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -44,18 +44,19 @@ #include "rocm_bandwidth_test.hpp" #include +#include + #include -#include #include -#include -//#include +#include +// #include // Parse option value string. The string has to be either // sin or cos literal // value as in example: -I sin or -I cos /* static bool ParseTrigValue(char* value_str, uint32_t&value) { - + // Capture the option value string std::cout << "Value of Trig: " << value_str << std::endl; int32_t cmp = strncasecmp("sin", value_str, 3); @@ -69,484 +70,459 @@ static bool ParseTrigValue(char* value_str, uint32_t&value) { value = 2; return true; } - + return false; } */ // Parse option value string. The string has one decimal // value as in example: -i 11.231926 -static bool ParseInitValue(char* value_str, long double&value) { - - // Capture the option value string - value = strtold(value_str, NULL); - return true; +static bool ParseInitValue(char* value_str, long double& value) { + // Capture the option value string + value = strtold(value_str, NULL); + return true; } // Parse option value string. The string has one more decimal // values separated by comma - "3,6,9,12,15". -static bool ParseOptionValue(char* value, vector&value_list) { - - // Capture the option value string - std::stringstream stream; - stream << value; - - uint32_t token = 0x11231926; - do { - - // Read the option value - stream >> token; - if (stream.fail()) { - return false; - } +static bool ParseOptionValue(char* value, vector& value_list) { + // Capture the option value string + std::stringstream stream; + stream << value; + + uint32_t token = 0x11231926; + do { + // Read the option value + stream >> token; + if (stream.fail()) { + return false; + } - // Update output list with values - value_list.push_back(token); + // Update output list with values + value_list.push_back(token); - // Ignore the delimiter - if((stream.eof()) || - (stream.peek() == ',')) { - stream.ignore(); - } else { - return false; - } + // Ignore the delimiter + if ((stream.eof()) || (stream.peek() == ',')) { + stream.ignore(); + } else { + return false; + } - } while (!stream.eof()); + } while (!stream.eof()); - return true; + return true; } void RocmBandwidthTest::ValidateCopyBidirFlags(uint32_t copy_ctrl_mask) { + // It is illegal to specify following flags + // secondary flag that affects a copy operation + if ((copy_ctrl_mask & DEV_COPY_LATENCY) || (copy_ctrl_mask & CPU_VISIBLE_TIME) || + (copy_ctrl_mask & VALIDATE_COPY_OP)) { + PrintHelpScreen(); + exit(0); + } - // It is illegal to specify following flags - // secondary flag that affects a copy operation - if ((copy_ctrl_mask & DEV_COPY_LATENCY) || - (copy_ctrl_mask & CPU_VISIBLE_TIME) || - (copy_ctrl_mask & VALIDATE_COPY_OP)) { - PrintHelpScreen(); - exit(0); - } - - return; + return; } -void RocmBandwidthTest::ValidateCopyUnidirFlags(uint32_t copy_mask, - uint32_t copy_ctrl_mask) { - - if (copy_mask != (USR_SRC_FLAG | USR_DST_FLAG)) { - PrintHelpScreen(); - exit(0); - } +void RocmBandwidthTest::ValidateCopyUnidirFlags(uint32_t copy_mask, uint32_t copy_ctrl_mask) { + if (copy_mask != (USR_SRC_FLAG | USR_DST_FLAG)) { + PrintHelpScreen(); + exit(0); + } - if ((copy_ctrl_mask & DEV_COPY_LATENCY) && - (copy_ctrl_mask & USR_BUFFER_SIZE)) { - PrintHelpScreen(); - exit(0); - } + if ((copy_ctrl_mask & DEV_COPY_LATENCY) && (copy_ctrl_mask & USR_BUFFER_SIZE)) { + PrintHelpScreen(); + exit(0); + } - // It is illegal to specify Latency and another - // secondary flag that affects a copy operation - if ((copy_ctrl_mask & DEV_COPY_LATENCY) && - (copy_ctrl_mask & VALIDATE_COPY_OP)) { - PrintHelpScreen(); - exit(0); - } + // It is illegal to specify Latency and another + // secondary flag that affects a copy operation + if ((copy_ctrl_mask & DEV_COPY_LATENCY) && (copy_ctrl_mask & VALIDATE_COPY_OP)) { + PrintHelpScreen(); + exit(0); + } - // It is illegal to specify user buffer sizes and another - // secondary flag that affects a copy operation - if ((copy_ctrl_mask & USR_BUFFER_SIZE) && - (copy_ctrl_mask & VALIDATE_COPY_OP)) { - PrintHelpScreen(); - exit(0); - } + // It is illegal to specify user buffer sizes and another + // secondary flag that affects a copy operation + if ((copy_ctrl_mask & USR_BUFFER_SIZE) && (copy_ctrl_mask & VALIDATE_COPY_OP)) { + PrintHelpScreen(); + exit(0); + } - // Check of illegal flags is complete - return; + // Check of illegal flags is complete + return; } void RocmBandwidthTest::ValidateCopyAllBidirFlags(uint32_t copy_ctrl_mask) { + // It is illegal to specify following flags + // secondary flag that affects a copy operation + if ((copy_ctrl_mask & DEV_COPY_LATENCY) || (copy_ctrl_mask & USR_BUFFER_SIZE) || + (copy_ctrl_mask & CPU_VISIBLE_TIME) || (copy_ctrl_mask & VALIDATE_COPY_OP)) { + PrintHelpScreen(); + exit(0); + } - // It is illegal to specify following flags - // secondary flag that affects a copy operation - if ((copy_ctrl_mask & DEV_COPY_LATENCY) || - (copy_ctrl_mask & USR_BUFFER_SIZE) || - (copy_ctrl_mask & CPU_VISIBLE_TIME) || - (copy_ctrl_mask & VALIDATE_COPY_OP)) { - PrintHelpScreen(); - exit(0); - } - - // Check of illegal flags is complete - return; + // Check of illegal flags is complete + return; } void RocmBandwidthTest::ValidateCopyAllUnidirFlags(uint32_t copy_ctrl_mask) { + // It is illegal to specify following flags + // secondary flag that affects a copy operation + if ((copy_ctrl_mask & DEV_COPY_LATENCY) || (copy_ctrl_mask & USR_BUFFER_SIZE)) { + PrintHelpScreen(); + exit(0); + } - // It is illegal to specify following flags - // secondary flag that affects a copy operation - if ((copy_ctrl_mask & DEV_COPY_LATENCY) || - (copy_ctrl_mask & USR_BUFFER_SIZE)) { - PrintHelpScreen(); - exit(0); - } - - // Check of illegal flags is complete - return; + // Check of illegal flags is complete + return; } -void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt, - uint32_t copy_mask, uint32_t copy_ctrl_mask) { - - // Input can't have more than two Primary flags - if ((pf_cnt == 0) || (pf_cnt > 2)) { - PrintHelpScreen(); - exit(0); - } +void RocmBandwidthTest::ValidateInputFlags(uint32_t pf_cnt, uint32_t copy_mask, + uint32_t copy_ctrl_mask) { + // Input can't have more than two Primary flags + if ((pf_cnt == 0) || (pf_cnt > 2)) { + PrintHelpScreen(); + exit(0); + } - // Input specifies unidirectional copy among subset of devices - // rocm_bandwidth_test -s Di,Dj,Dk -d Dp,Dq,Dr - if (pf_cnt == 2) { - return ValidateCopyUnidirFlags(copy_mask, copy_ctrl_mask); - } + // Input specifies unidirectional copy among subset of devices + // rocm_bandwidth_test -s Di,Dj,Dk -d Dp,Dq,Dr + if (pf_cnt == 2) { + return ValidateCopyUnidirFlags(copy_mask, copy_ctrl_mask); + } - // Input is requesting to print RBT version - // rocm_bandwidth_test -q - if (req_version_ == REQ_VERSION) { - PrintVersion(); - exit(0); - } + // Input is requesting to print RBT version + // rocm_bandwidth_test -q + if (req_version_ == REQ_VERSION) { + PrintVersion(); + exit(0); + } - // Input is requesting to print ROCm topology - // rocm_bandwidth_test -t - if (req_topology_ == REQ_TOPOLOGY) { - return; - } + // Input is requesting to print ROCm topology + // rocm_bandwidth_test -t + if (req_topology_ == REQ_TOPOLOGY) { + return; + } - // Input is requesting to print list of devices - // rocm_bandwidth_test -e - if (req_list_devs_ == REQ_LIST_DEVS) { - return; - } + // Input is requesting to print list of devices + // rocm_bandwidth_test -e + if (req_list_devs_ == REQ_LIST_DEVS) { + return; + } - // Input is for bidirectional bandwidth for some devices - // rocm_bandwidth_test -b - if (req_copy_bidir_ == REQ_COPY_BIDIR) { - return ValidateCopyBidirFlags(copy_ctrl_mask); - } + // Input is for bidirectional bandwidth for some devices + // rocm_bandwidth_test -b + if (req_copy_bidir_ == REQ_COPY_BIDIR) { + return ValidateCopyBidirFlags(copy_ctrl_mask); + } - // Input is for bidirectional bandwidth for all devices - // rocm_bandwidth_test -A - if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { - return ValidateCopyAllBidirFlags(copy_ctrl_mask); - } + // Input is for bidirectional bandwidth for all devices + // rocm_bandwidth_test -A + if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { + return ValidateCopyAllBidirFlags(copy_ctrl_mask); + } - // Input is for unidirectional bandwidth for all devices - // rocm_bandwidth_test -a - if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { - return ValidateCopyAllUnidirFlags(copy_ctrl_mask); - } + // Input is for unidirectional bandwidth for all devices + // rocm_bandwidth_test -a + if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { + return ValidateCopyAllUnidirFlags(copy_ctrl_mask); + } - // Input is requesting to run concurrent copies - // rocm_bandwidth_test -k or -K - // It is illegal to specify secondary flags - if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || - (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { - if ((copy_ctrl_mask & DEV_COPY_LATENCY) || - (copy_ctrl_mask & USR_BUFFER_INIT) || - (copy_ctrl_mask & USR_BUFFER_SIZE) || - (copy_ctrl_mask & CPU_VISIBLE_TIME) || - (copy_ctrl_mask & VALIDATE_COPY_OP)) { - PrintHelpScreen(); - exit(0); + // Input is requesting to run concurrent copies + // rocm_bandwidth_test -k or -K + // It is illegal to specify secondary flags + if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || + (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { + if ((copy_ctrl_mask & DEV_COPY_LATENCY) || (copy_ctrl_mask & USR_BUFFER_INIT) || + (copy_ctrl_mask & USR_BUFFER_SIZE) || (copy_ctrl_mask & CPU_VISIBLE_TIME) || + (copy_ctrl_mask & VALIDATE_COPY_OP)) { + PrintHelpScreen(); + exit(0); + } + return; } - return; - } - std::cout << "ValidateInputFlags: This should not be happening" << std::endl; - assert(false); - return; + std::cout << "ValidateInputFlags: This should not be happening" << std::endl; + assert(false); + return; } void RocmBandwidthTest::BuildDeviceList() { - - // Initialize devices list if copying unidirectional - // all or bidirectional all mode is enabled - uint32_t size = pool_list_.size(); - for (uint32_t idx = 0; idx < size; idx++) { - if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { - bidir_list_.push_back(idx); - } else { - src_list_.push_back(idx); - dst_list_.push_back(idx); + // Initialize devices list if copying unidirectional + // all or bidirectional all mode is enabled + uint32_t size = pool_list_.size(); + for (uint32_t idx = 0; idx < size; idx++) { + if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { + bidir_list_.push_back(idx); + } else { + src_list_.push_back(idx); + dst_list_.push_back(idx); + } } - } } void RocmBandwidthTest::BuildBufferList() { - - // User has specified buffer sizes to be used - if (size_list_.size() != 0) { - uint32_t size_len = size_list_.size(); - for (uint32_t idx = 0; idx < size_len; idx++) { - size_list_[idx] = size_list_[idx] * 1024 * 1024; + // User has specified buffer sizes to be used + if (size_list_.size() != 0) { + uint32_t size_len = size_list_.size(); + for (uint32_t idx = 0; idx < size_len; idx++) { + size_list_[idx] = size_list_[idx] * 1024 * 1024; + } + return; } - return; - } - // User has NOT specified buffer sizes to be used - // For All Copy operations use only one buffer size - uint32_t size_len = sizeof(SIZE_LIST)/sizeof(size_t); - for (uint32_t idx = 0; idx < size_len; idx++) { + // User has NOT specified buffer sizes to be used + // For All Copy operations use only one buffer size + uint32_t size_len = sizeof(SIZE_LIST) / sizeof(size_t); + for (uint32_t idx = 0; idx < size_len; idx++) { + if ((req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) || + (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { + if (idx == 16) { + size_list_.push_back(SIZE_LIST[idx]); + } + } + + if (req_copy_unidir_ == REQ_COPY_UNIDIR) { + if (latency_) { + size_list_.push_back(LATENCY_SIZE_LIST[idx]); + } else if (validate_) { + if (idx == 16) { + size_list_.push_back(SIZE_LIST[idx]); + } + } else { + size_list_.push_back(SIZE_LIST[idx]); + } + } + + if (req_copy_bidir_ == REQ_COPY_BIDIR) { + size_list_.push_back(SIZE_LIST[idx]); + } - if ((req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) || - (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { - if (idx == 16) { - size_list_.push_back(SIZE_LIST[idx]); - } + if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || + (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { + size_list_.push_back(SIZE_LIST[idx]); + } } +} - if (req_copy_unidir_ == REQ_COPY_UNIDIR) { - if (latency_) { - size_list_.push_back(LATENCY_SIZE_LIST[idx]); - } else if (validate_) { - if (idx == 16) { - size_list_.push_back(SIZE_LIST[idx]); +void RocmBandwidthTest::ParseArguments() { + bool print_help = 0; + uint32_t copy_mask = 0; + uint32_t copy_ctrl_mask = 0; + uint32_t num_primary_flags = 0; + + // This will suppress prints from getopt implementation + // In case of error, it will return the character '?' as + // return value. + opterr = 0; + + int opt; + bool status; + while ((opt = getopt(usr_argc_, usr_argv_, "hqteclvaAb:i:s:d:r:w:m:k:K:")) != -1) { + switch (opt) { + // Print help screen + case 'h': + print_help = true; + break; + + // Print version of the test + case 'q': + num_primary_flags++; + req_version_ = REQ_VERSION; + break; + + // Print list of devices + case 'e': + num_primary_flags++; + req_list_devs_ = REQ_LIST_DEVS; + break; + + // Print system topology + case 't': + num_primary_flags++; + req_topology_ = REQ_TOPOLOGY; + break; + + // Enable Unidirectional copy among all valid buffers + case 'a': + num_primary_flags++; + req_copy_all_unidir_ = REQ_COPY_ALL_UNIDIR; + break; + + // Enable Bidirectional copy among all valid buffers + case 'A': + num_primary_flags++; + req_copy_all_bidir_ = REQ_COPY_ALL_BIDIR; + break; + + // Collect list of source buffers involved in unidirectional copy operation + case 's': + status = ParseOptionValue(optarg, src_list_); + if (status) { + num_primary_flags++; + copy_mask |= USR_SRC_FLAG; + req_copy_unidir_ = REQ_COPY_UNIDIR; + break; + } + print_help = true; + break; + + // Collect list of destination buffers involved in unidirectional copy operation + case 'd': + status = ParseOptionValue(optarg, dst_list_); + if (status) { + num_primary_flags++; + copy_mask |= USR_DST_FLAG; + req_copy_unidir_ = REQ_COPY_UNIDIR; + break; + } + print_help = true; + break; + + // Collect list of agents involved in bidirectional copy operation + case 'b': + status = ParseOptionValue(optarg, bidir_list_); + if (status) { + num_primary_flags++; + req_copy_bidir_ = REQ_COPY_BIDIR; + break; + } + print_help = true; + break; + + // Collect list of agents involved in concurrent copy operation + case 'k': + case 'K': + status = ParseOptionValue(optarg, bidir_list_); + if ((status) && ((bidir_list_.size() % 2) == 0)) { + num_primary_flags++; + if (opt == 'K') { + req_concurrent_copy_bidir_ = REQ_CONCURRENT_COPY_BIDIR; + } else { + req_concurrent_copy_unidir_ = REQ_CONCURRENT_COPY_UNIDIR; + } + break; + } + print_help = true; + break; + + // Size of buffers to use in copy and read/write operations + case 'm': + status = ParseOptionValue(optarg, size_list_); + if (status == false) { + print_help = true; + break; + } + copy_ctrl_mask |= USR_BUFFER_SIZE; + break; + + // Print Cpu time + case 'c': + print_cpu_time_ = true; + copy_ctrl_mask |= CPU_VISIBLE_TIME; + break; + + // Set Latency mode flag to true + case 'l': + latency_ = true; + copy_ctrl_mask |= DEV_COPY_LATENCY; + break; + + // Set validation mode flag to true + case 'v': + validate_ = true; + copy_ctrl_mask |= VALIDATE_COPY_OP; + break; + + // Set initialization mode flag to true + case 'i': + init_ = true; + status = ParseInitValue(optarg, init_val_); + if (status == false) { + print_help = true; + } + copy_ctrl_mask |= USR_BUFFER_INIT; + break; + + // Collect request to read a buffer + case 'r': + req_read_ = REQ_READ; + status = ParseOptionValue(optarg, read_list_); + if (status == false) { + print_help = true; + } + break; + + // Collect request to write a buffer + case 'w': + req_write_ = REQ_WRITE; + status = ParseOptionValue(optarg, write_list_); + if (status == false) { + print_help = true; + } + break; + + // getopt implementation returns the value of the unknown + // option or an option with missing operand in the variable + // optopt + case '?': + std::cout << "Argument is illegal or needs value: " << '?' << std::endl; + if ((optopt == 'b') || (optopt == 's') || (optopt == 'd') || (optopt == 'm') || + (optopt == 'i') || (false)) { + std::cout << "Error: Options -b -s -d -m -i -k and -K require argument" + << std::endl; + } + print_help = true; + break; + default: + print_help = true; + break; } - } else { - size_list_.push_back(SIZE_LIST[idx]); - } } - if (req_copy_bidir_ == REQ_COPY_BIDIR) { - size_list_.push_back(SIZE_LIST[idx]); + // Print help screen if user option has "-h" + if (print_help) { + PrintHelpScreen(); + exit(0); } - - if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || - (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { - size_list_.push_back(SIZE_LIST[idx]); + + // Determine input of primary flags is valid + ValidateInputFlags(num_primary_flags, copy_mask, copy_ctrl_mask); + + // Initialize Roc Runtime + err_ = hsa_init(); + ErrorCheck(err_); + + // Discover the topology of RocR agent in system + DiscoverTopology(); + + // Print list of devices if user option is "-e" + if (req_list_devs_ == REQ_LIST_DEVS) { + PrintVersion(); + PrintTopology(); + exit(0); } - } -} -void RocmBandwidthTest::ParseArguments() { + // Print system topology if user option is "-t" + if (req_topology_ == REQ_TOPOLOGY) { + PrintVersion(); + PrintTopology(); + PrintLinkPropsMatrix(LINK_PROP_ACCESS); + PrintLinkPropsMatrix(LINK_PROP_TYPE); + PrintLinkPropsMatrix(LINK_PROP_WEIGHT); + exit(0); + } - bool print_help = 0; - uint32_t copy_mask = 0; - uint32_t copy_ctrl_mask = 0; - uint32_t num_primary_flags = 0; - - // This will suppress prints from getopt implementation - // In case of error, it will return the character '?' as - // return value. - opterr = 0; - - int opt; - bool status; - while ((opt = getopt(usr_argc_, usr_argv_, "hqteclvaAb:i:s:d:r:w:m:k:K:")) != -1) { - switch (opt) { - - // Print help screen - case 'h': - print_help = true; - break; - - // Print version of the test - case 'q': - num_primary_flags++; - req_version_ = REQ_VERSION; - break; - - // Print list of devices - case 'e': - num_primary_flags++; - req_list_devs_ = REQ_LIST_DEVS; - break; - - // Print system topology - case 't': - num_primary_flags++; - req_topology_ = REQ_TOPOLOGY; - break; - - // Enable Unidirectional copy among all valid buffers - case 'a': - num_primary_flags++; - req_copy_all_unidir_ = REQ_COPY_ALL_UNIDIR; - break; - - // Enable Bidirectional copy among all valid buffers - case 'A': - num_primary_flags++; - req_copy_all_bidir_ = REQ_COPY_ALL_BIDIR; - break; - - // Collect list of source buffers involved in unidirectional copy operation - case 's': - status = ParseOptionValue(optarg, src_list_); - if (status) { - num_primary_flags++; - copy_mask |= USR_SRC_FLAG; - req_copy_unidir_ = REQ_COPY_UNIDIR; - break; - } - print_help = true; - break; - - // Collect list of destination buffers involved in unidirectional copy operation - case 'd': - status = ParseOptionValue(optarg, dst_list_); - if (status) { - num_primary_flags++; - copy_mask |= USR_DST_FLAG; - req_copy_unidir_ = REQ_COPY_UNIDIR; - break; - } - print_help = true; - break; - - // Collect list of agents involved in bidirectional copy operation - case 'b': - status = ParseOptionValue(optarg, bidir_list_); - if (status) { - num_primary_flags++; - req_copy_bidir_ = REQ_COPY_BIDIR; - break; - } - print_help = true; - break; - - // Collect list of agents involved in concurrent copy operation - case 'k': - case 'K': - status = ParseOptionValue(optarg, bidir_list_); - if ((status) && ((bidir_list_.size() % 2) == 0)) { - num_primary_flags++; - if (opt == 'K') { - req_concurrent_copy_bidir_ = REQ_CONCURRENT_COPY_BIDIR; - } else { - req_concurrent_copy_unidir_ = REQ_CONCURRENT_COPY_UNIDIR; - } - break; - } - print_help = true; - break; - - // Size of buffers to use in copy and read/write operations - case 'm': - status = ParseOptionValue(optarg, size_list_); - if (status == false) { - print_help = true; - break; - } - copy_ctrl_mask |= USR_BUFFER_SIZE; - break; - - // Print Cpu time - case 'c': - print_cpu_time_ = true; - copy_ctrl_mask |= CPU_VISIBLE_TIME; - break; - - // Set Latency mode flag to true - case 'l': - latency_ = true; - copy_ctrl_mask |= DEV_COPY_LATENCY; - break; - - // Set validation mode flag to true - case 'v': - validate_ = true; - copy_ctrl_mask |= VALIDATE_COPY_OP; - break; - - // Set initialization mode flag to true - case 'i': - init_ = true; - status = ParseInitValue(optarg, init_val_); - if (status == false) { - print_help = true; - } - copy_ctrl_mask |= USR_BUFFER_INIT; - break; - - // Collect request to read a buffer - case 'r': - req_read_ = REQ_READ; - status = ParseOptionValue(optarg, read_list_); - if (status == false) { - print_help = true; - } - break; - - // Collect request to write a buffer - case 'w': - req_write_ = REQ_WRITE; - status = ParseOptionValue(optarg, write_list_); - if (status == false) { - print_help = true; - } - break; - - // getopt implementation returns the value of the unknown - // option or an option with missing operand in the variable - // optopt - case '?': - std::cout << "Argument is illegal or needs value: " << '?' << std::endl; - if ((optopt == 'b') || (optopt == 's') || - (optopt == 'd') || (optopt == 'm') || - (optopt == 'i') || (false)) { - std::cout << "Error: Options -b -s -d -m -i -k and -K require argument" << std::endl; - } - print_help = true; - break; - default: - print_help = true; - break; + // Initialize devices list if copying unidirectional + // all or bidirectional all mode is enabled + if ((req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) || + (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { + BuildDeviceList(); } - } - - // Print help screen if user option has "-h" - if (print_help) { - PrintHelpScreen(); - exit(0); - } - - // Determine input of primary flags is valid - ValidateInputFlags(num_primary_flags, copy_mask, copy_ctrl_mask); - - // Initialize Roc Runtime - err_ = hsa_init(); - ErrorCheck(err_); - - // Discover the topology of RocR agent in system - DiscoverTopology(); - - // Print list of devices if user option is "-e" - if (req_list_devs_ == REQ_LIST_DEVS) { - PrintVersion(); - PrintTopology(); - exit(0); - } - - // Print system topology if user option is "-t" - if (req_topology_ == REQ_TOPOLOGY) { - PrintVersion(); - PrintTopology(); - PrintLinkPropsMatrix(LINK_PROP_ACCESS); - PrintLinkPropsMatrix(LINK_PROP_TYPE); - PrintLinkPropsMatrix(LINK_PROP_WEIGHT); - exit(0); - } - // Initialize devices list if copying unidirectional - // all or bidirectional all mode is enabled - if ((req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) || - (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { - BuildDeviceList(); - } - - // Initialize list of buffer sizes used in copy operations - BuildBufferList(); - std::sort(size_list_.begin(), size_list_.end()); + // Initialize list of buffer sizes used in copy operations + BuildBufferList(); + std::sort(size_list_.begin(), size_list_.end()); } - diff --git a/rocm_bandwidth_test_print.cpp b/rocm_bandwidth_test_print.cpp old mode 100755 new mode 100644 index be18601..b548668 --- a/rocm_bandwidth_test_print.cpp +++ b/rocm_bandwidth_test_print.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -44,369 +44,362 @@ #include "rocm_bandwidth_test.hpp" #include + #include -#include #include +#include // @Brief: Print Help Menu Screen void RocmBandwidthTest::PrintHelpScreen() { + std::cout << std::endl; + std::cout << "Supported arguments:" << std::endl; + std::cout << std::endl; + std::cout << "\t -h Prints the help screen" << std::endl; + std::cout << "\t -q Query version of the test" << std::endl; + std::cout << "\t -v Run the test in validation mode" << std::endl; + std::cout << "\t -l Run test to collect Latency data" << std::endl; + std::cout << "\t -c Time the operation using CPU Timers" << std::endl; + std::cout << "\t -e Prints the list of ROCm devices enabled on platform" << std::endl; + std::cout << "\t -i Initialize copy buffer with specified 'long double' pattern" + << std::endl; + std::cout << "\t -t Prints system topology and allocatable memory info" << std::endl; + std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl; + std::cout << "\t -b List devices to use in bidirectional copy operations" << std::endl; + std::cout << "\t -s List of source devices to use in copy unidirectional operations" + << std::endl; + std::cout << "\t -d List of destination devices to use in unidirectional copy operations" + << std::endl; + std::cout << "\t -a Perform Unidirectional Copy involving all device combinations" + << std::endl; + std::cout << "\t -A Perform Bidirectional Copy involving all device combinations" + << std::endl; + std::cout << std::endl; - std::cout << std::endl; - std::cout << "Supported arguments:" << std::endl; - std::cout << std::endl; - std::cout << "\t -h Prints the help screen" << std::endl; - std::cout << "\t -q Query version of the test" << std::endl; - std::cout << "\t -v Run the test in validation mode" << std::endl; - std::cout << "\t -l Run test to collect Latency data" << std::endl; - std::cout << "\t -c Time the operation using CPU Timers" << std::endl; - std::cout << "\t -e Prints the list of ROCm devices enabled on platform" << std::endl; - std::cout << "\t -i Initialize copy buffer with specified 'long double' pattern" << std::endl; - std::cout << "\t -t Prints system topology and allocatable memory info" << std::endl; - std::cout << "\t -m List of buffer sizes to use, specified in Megabytes" << std::endl; - std::cout << "\t -b List devices to use in bidirectional copy operations" << std::endl; - std::cout << "\t -s List of source devices to use in copy unidirectional operations" << std::endl; - std::cout << "\t -d List of destination devices to use in unidirectional copy operations" << std::endl; - std::cout << "\t -a Perform Unidirectional Copy involving all device combinations" << std::endl; - std::cout << "\t -A Perform Bidirectional Copy involving all device combinations" << std::endl; - std::cout << std::endl; - - std::cout << "\t NOTE: Mixing following options is illegal/unsupported" << std::endl; - std::cout << "\t\t Case 1: rocm_bandwidth_test -a with {lm}{1,}" << std::endl; - std::cout << "\t\t Case 2: rocm_bandwidth_test -b with {clv}{1,}" << std::endl; - std::cout << "\t\t Case 3: rocm_bandwidth_test -A with {clmv}{1,}" << std::endl; - std::cout << "\t\t Case 4: rocm_bandwidth_test -s x -d y with {lmv}{2,}" << std::endl; - std::cout << std::endl; - - std::cout << std::endl; + std::cout << "\t NOTE: Mixing following options is illegal/unsupported" << std::endl; + std::cout << "\t\t Case 1: rocm_bandwidth_test -a with {lm}{1,}" << std::endl; + std::cout << "\t\t Case 2: rocm_bandwidth_test -b with {clv}{1,}" << std::endl; + std::cout << "\t\t Case 3: rocm_bandwidth_test -A with {clmv}{1,}" << std::endl; + std::cout << "\t\t Case 4: rocm_bandwidth_test -s x -d y with {lmv}{2,}" << std::endl; + std::cout << std::endl; + std::cout << std::endl; } // @brief: Print the cmdline used to run the test void RocmBandwidthTest::PrintLaunchCmd() const { + uint32_t format = 10; + std::cout.setf(ios::left); - uint32_t format = 10; - std::cout.setf(ios::left); + std::cout << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout << "Launch Command is: "; - std::cout << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout << "Launch Command is: "; + // Print the exe value + std::cout << usr_argv_[0]; - // Print the exe value - std::cout << usr_argv_[0]; + // Return for default run + if (bw_default_run_ != NULL) { + std::cout << " (rocm_bandwidth -a + rocm_bandwidth -A)"; + std::cout << std::endl; + std::cout << std::endl; + return; + } + + // Print launch parameters for non-default runs + for (uint32_t idx = 1; idx < usr_argc_; idx++) { + std::cout << " " << usr_argv_[idx]; + } - // Return for default run - if (bw_default_run_ != NULL) { - std::cout << " (rocm_bandwidth -a + rocm_bandwidth -A)"; std::cout << std::endl; std::cout << std::endl; - return; - } - - // Print launch parameters for non-default runs - for (uint32_t idx = 1; idx < usr_argc_; idx++) { - std::cout << " " << usr_argv_[idx]; - } - - std::cout << std::endl; - std::cout << std::endl; } // @brief: Print the version of the test void RocmBandwidthTest::PrintVersion() const { + uint32_t format = 10; + std::cout.setf(ios::left); - uint32_t format = 10; - std::cout.setf(ios::left); + std::cout << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout << "RocmBandwidthTest Version: " << GetVersion() << std::endl; - std::cout << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout << "RocmBandwidthTest Version: " << GetVersion() << std::endl; - - // Print launch command - PrintLaunchCmd(); + // Print launch command + PrintLaunchCmd(); } // @brief: Print the topology of Memory Pools and Devices present in system void RocmBandwidthTest::PrintTopology() { - - uint32_t format = 10; - size_t count = agent_pool_list_.size(); - std::cout << std::endl; - for (uint32_t idx = 0; idx < count; idx++) { - agent_pool_info_t node = agent_pool_list_.at(idx); - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - - // Print device info - std::cout << "Device Index: " - << node.agent.index_ << std::endl; - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - - if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) { - std::cout << " Device Type: CPU" << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << " Device Name: " << node.agent.name_ << std::endl; - } else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) { - std::cout << " Device Type: GPU" << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << " Device Name: " << node.agent.name_ << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << " Device BDF: " << node.agent.bdf_id_ << std::endl; - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << " Device UUID: " << node.agent.uuid_ << std::endl; - } - - // Print pool info - size_t pool_count = node.pool_list.size(); - for (uint32_t jdx = 0; jdx < pool_count; jdx++) { - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - - std::cout << " Allocatable Memory Size (KB): " - << node.pool_list.at(jdx).allocable_size_ / 1024 << std::endl; + uint32_t format = 10; + size_t count = agent_pool_list_.size(); + std::cout << std::endl; + for (uint32_t idx = 0; idx < count; idx++) { + agent_pool_info_t node = agent_pool_list_.at(idx); + + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + + // Print device info + std::cout << "Device Index: " << node.agent.index_ << std::endl; + + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + + if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) { + std::cout << " Device Type: CPU" << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << " Device Name: " << node.agent.name_ + << std::endl; + } else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) { + std::cout << " Device Type: GPU" << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << " Device Name: " << node.agent.name_ + << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << " Device BDF: " << node.agent.bdf_id_ + << std::endl; + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << " Device UUID: " << node.agent.uuid_ + << std::endl; + } + + // Print pool info + size_t pool_count = node.pool_list.size(); + for (uint32_t jdx = 0; jdx < pool_count; jdx++) { + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + + std::cout << " Allocatable Memory Size (KB): " + << node.pool_list.at(jdx).allocable_size_ / 1024 << std::endl; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; } std::string GetValueAsString(uint32_t key, uint32_t value) { - - std::stringstream ss; - - switch(key) { - case RocmBandwidthTest::LINK_PROP_ACCESS: - ss << value; - return ss.str(); - break; - case RocmBandwidthTest::LINK_PROP_HOPS: - case RocmBandwidthTest::LINK_PROP_WEIGHT: - ss << value; - return (value == 0xFFFFFFFF) ? std::string("N/A") : ss.str(); - break; - case RocmBandwidthTest::LINK_PROP_TYPE: - if ((value == RocmBandwidthTest::LINK_TYPE_SELF) || - (value == RocmBandwidthTest::LINK_TYPE_NO_PATH) || - (value == RocmBandwidthTest::LINK_TYPE_IGNORED)) { - return std::string("N/A"); - } else if (value == RocmBandwidthTest::LINK_TYPE_XGMI) { - return std::string("X"); - } else if (value == RocmBandwidthTest::LINK_TYPE_PCIE) { - return std::string("P"); - } - break; - } - std::cout << "An illegal key to get value for" << std::endl; - assert(false); - return ""; + std::stringstream ss; + + switch (key) { + case RocmBandwidthTest::LINK_PROP_ACCESS: + ss << value; + return ss.str(); + break; + case RocmBandwidthTest::LINK_PROP_HOPS: + case RocmBandwidthTest::LINK_PROP_WEIGHT: + ss << value; + return (value == 0xFFFFFFFF) ? std::string("N/A") : ss.str(); + break; + case RocmBandwidthTest::LINK_PROP_TYPE: + if ((value == RocmBandwidthTest::LINK_TYPE_SELF) || + (value == RocmBandwidthTest::LINK_TYPE_NO_PATH) || + (value == RocmBandwidthTest::LINK_TYPE_IGNORED)) { + return std::string("N/A"); + } else if (value == RocmBandwidthTest::LINK_TYPE_XGMI) { + return std::string("X"); + } else if (value == RocmBandwidthTest::LINK_TYPE_PCIE) { + return std::string("P"); + } + break; + } + std::cout << "An illegal key to get value for" << std::endl; + assert(false); + return ""; } void RocmBandwidthTest::PrintLinkPropsMatrix(uint32_t key) const { + uint32_t format = 10; + std::cout.setf(ios::left); - uint32_t format = 10; - std::cout.setf(ios::left); - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - - switch(key) { - case LINK_PROP_ACCESS: - std::cout << "Inter-Device Access"; - break; - case LINK_PROP_TYPE: - std::cout << "Inter-Device Link Type: P = PCIe, X = xGMI, N/A = Not Applicable"; - break; - case LINK_PROP_HOPS: - std::cout << "Inter-Device Link Hops"; - break; - case LINK_PROP_WEIGHT: - std::cout << "Inter-Device Numa Distance"; - break; - default: - std::cout << "An illegal key to print matrix" << std::endl; - assert(false); - } - std::cout << std::endl; - std::cout << std::endl; - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << "D/D"; - for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { - std::cout.width(format); - std::cout << idx0; - } - std::cout << std::endl; - std::cout << std::endl; - - for (uint32_t src_idx = 0; src_idx < agent_index_; src_idx++) { std::cout.width(format); std::cout << ""; std::cout.width(format); - std::cout << src_idx; - for (uint32_t dst_idx = 0; dst_idx < agent_index_; dst_idx++) { - uint32_t value = 0x00; - switch(key) { + + switch (key) { case LINK_PROP_ACCESS: - value = direct_access_matrix_[(src_idx * agent_index_) + dst_idx]; - break; + std::cout << "Inter-Device Access"; + break; case LINK_PROP_TYPE: - value = link_type_matrix_[(src_idx * agent_index_) + dst_idx]; - break; + std::cout << "Inter-Device Link Type: P = PCIe, X = xGMI, N/A = Not Applicable"; + break; case LINK_PROP_HOPS: - value = link_hops_matrix_[(src_idx * agent_index_) + dst_idx]; - break; + std::cout << "Inter-Device Link Hops"; + break; case LINK_PROP_WEIGHT: - value = link_weight_matrix_[(src_idx * agent_index_) + dst_idx]; - break; - } - std::cout.width(format); - std::cout << GetValueAsString(key, value); + std::cout << "Inter-Device Numa Distance"; + break; + default: + std::cout << "An illegal key to print matrix" << std::endl; + assert(false); + } + std::cout << std::endl; + std::cout << std::endl; + + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << "D/D"; + for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { + std::cout.width(format); + std::cout << idx0; } std::cout << std::endl; std::cout << std::endl; - } - std::cout << std::endl; + + for (uint32_t src_idx = 0; src_idx < agent_index_; src_idx++) { + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << src_idx; + for (uint32_t dst_idx = 0; dst_idx < agent_index_; dst_idx++) { + uint32_t value = 0x00; + switch (key) { + case LINK_PROP_ACCESS: + value = direct_access_matrix_[(src_idx * agent_index_) + dst_idx]; + break; + case LINK_PROP_TYPE: + value = link_type_matrix_[(src_idx * agent_index_) + dst_idx]; + break; + case LINK_PROP_HOPS: + value = link_hops_matrix_[(src_idx * agent_index_) + dst_idx]; + break; + case LINK_PROP_WEIGHT: + value = link_weight_matrix_[(src_idx * agent_index_) + dst_idx]; + break; + } + std::cout.width(format); + std::cout << GetValueAsString(key, value); + } + std::cout << std::endl; + std::cout << std::endl; + } + std::cout << std::endl; } // @brief: Print info on Devices in system void RocmBandwidthTest::PrintAgentsList() { - - size_t count = agent_pool_list_.size(); - for (uint32_t idx = 0; idx < count; idx++) { - std::cout << std::endl; - agent_pool_info_t node = agent_pool_list_.at(idx); - std::cout << "Device Index: " - << node.agent.index_ << std::endl; - if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) { - std::cout << " Device Type: CPU" << std::endl; - } else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) { - std::cout << " Device Type: GPU" << std::endl; - std::cout << " Device BDF: " << node.agent.bdf_id_ << std::endl; - std::cout << " Device UUID: " << node.agent.uuid_ << std::endl; + size_t count = agent_pool_list_.size(); + for (uint32_t idx = 0; idx < count; idx++) { + std::cout << std::endl; + agent_pool_info_t node = agent_pool_list_.at(idx); + std::cout << "Device Index: " << node.agent.index_ << std::endl; + if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_) { + std::cout << " Device Type: CPU" << std::endl; + } else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_) { + std::cout << " Device Type: GPU" << std::endl; + std::cout << " Device BDF: " << node.agent.bdf_id_ + << std::endl; + std::cout << " Device UUID: " << node.agent.uuid_ + << std::endl; + } } - } - std::cout << std::endl; + std::cout << std::endl; } // @brief: Print info on memory pools in system void RocmBandwidthTest::PrintPoolsList() { - - size_t pool_count = pool_list_.size(); - for (uint32_t jdx = 0; jdx < pool_count; jdx++) { + size_t pool_count = pool_list_.size(); + for (uint32_t jdx = 0; jdx < pool_count; jdx++) { + std::cout << std::endl; + std::cout << "Memory Pool Idx: " << pool_list_.at(jdx).index_ + << std::endl; + std::cout << " max allocable size in KB: " + << pool_list_.at(jdx).allocable_size_ / 1024 << std::endl; + std::cout << " segment id: " << pool_list_.at(jdx).segment_ + << std::endl; + std::cout << " is kernarg: " << pool_list_.at(jdx).is_kernarg_ + << std::endl; + std::cout << " is fine-grained: " + << pool_list_.at(jdx).is_fine_grained_ << std::endl; + std::cout << " accessible to owner: " + << pool_list_.at(jdx).owner_access_ << std::endl; + std::cout << " accessible to all by default: " + << pool_list_.at(jdx).access_to_all_ << std::endl; + } std::cout << std::endl; - std::cout << "Memory Pool Idx: " - << pool_list_.at(jdx).index_ << std::endl; - std::cout << " max allocable size in KB: " - << pool_list_.at(jdx).allocable_size_ / 1024 << std::endl; - std::cout << " segment id: " - << pool_list_.at(jdx).segment_ << std::endl; - std::cout << " is kernarg: " - << pool_list_.at(jdx).is_kernarg_ << std::endl; - std::cout << " is fine-grained: " - << pool_list_.at(jdx).is_fine_grained_ << std::endl; - std::cout << " accessible to owner: " - << pool_list_.at(jdx).owner_access_ << std::endl; - std::cout << " accessible to all by default: " - << pool_list_.at(jdx).access_to_all_ << std::endl; - } - std::cout << std::endl; - } // @brief: Print the list of transactions that will be executed void RocmBandwidthTest::PrintTransList() { - - size_t count = trans_list_.size(); - for (uint32_t idx = 0; idx < count; idx++) { - async_trans_t trans = trans_list_.at(idx); - std::cout << std::endl; - std::cout << " Transaction Id: " << idx << std::endl; - std::cout << " Transaction Type: " << trans.req_type_ << std::endl; - if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) { - std::cout << "Rocm Kernel used by Transaction: " << trans.kernel.code_ << std::endl; - std::cout << "Rocm Buffer index Used by Kernel: " << trans.kernel.pool_idx_ << std::endl; - std::cout << " Rocm Device used for Execution: " << trans.kernel.agent_idx_ << std::endl; - } - if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR)) { - std::cout << " Src Buffer used in Copy: " << trans.copy.src_idx_ << std::endl; - std::cout << " Dst Buffer used in Copy: " << trans.copy.dst_idx_ << std::endl; - } - if ((trans.req_type_ == REQ_COPY_ALL_BIDIR) || (trans.req_type_ == REQ_COPY_ALL_UNIDIR)) { - std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; - std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; - } - if ((trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) || - (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) { - std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; - std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; + size_t count = trans_list_.size(); + for (uint32_t idx = 0; idx < count; idx++) { + async_trans_t trans = trans_list_.at(idx); + std::cout << std::endl; + std::cout << " Transaction Id: " << idx << std::endl; + std::cout << " Transaction Type: " << trans.req_type_ << std::endl; + if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) { + std::cout << "Rocm Kernel used by Transaction: " << trans.kernel.code_ << std::endl; + std::cout << "Rocm Buffer index Used by Kernel: " << trans.kernel.pool_idx_ + << std::endl; + std::cout << " Rocm Device used for Execution: " << trans.kernel.agent_idx_ + << std::endl; + } + if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR)) { + std::cout << " Src Buffer used in Copy: " << trans.copy.src_idx_ << std::endl; + std::cout << " Dst Buffer used in Copy: " << trans.copy.dst_idx_ << std::endl; + } + if ((trans.req_type_ == REQ_COPY_ALL_BIDIR) || (trans.req_type_ == REQ_COPY_ALL_UNIDIR)) { + std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; + std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; + } + if ((trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) || + (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) { + std::cout << " Src Memory Pool used in Copy: " << trans.copy.src_idx_ << std::endl; + std::cout << " Dst Memory Pool used in Copy: " << trans.copy.dst_idx_ << std::endl; + } } - - } - std::cout << std::endl; + std::cout << std::endl; } // @brief: Prints error message when a request to copy between // source buffer and destination buffer is not possible void RocmBandwidthTest::PrintCopyAccessError(uint32_t src_idx, uint32_t dst_idx) { + // Retrieve Roc runtime handles for Src memory pool and devices + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + // Retrieve Roc runtime handles for Dst memory pool and devices + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - // Retrieve Roc runtime handles for Src memory pool and devices - uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - - // Retrieve Roc runtime handles for Dst memory pool and devices - uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - - std::cout << std::endl; - std::cout << "Src Device: Index " - << src_dev_idx - << ", Type: " - << ((src_dev_type == HSA_DEVICE_TYPE_CPU) ? "CPU" : "GPU") << std::endl; - std::cout << "Dst Device: Index " - << dst_dev_idx - << ", Type: " - << ((dst_dev_type == HSA_DEVICE_TYPE_CPU) ? "CPU" : "GPU") << std::endl; - std::cout << "Rocm Device hosting Src Memory cannot ACCESS Dst Memory" << std::endl; - std::cout << std::endl; + std::cout << std::endl; + std::cout << "Src Device: Index " << src_dev_idx + << ", Type: " << ((src_dev_type == HSA_DEVICE_TYPE_CPU) ? "CPU" : "GPU") << std::endl; + std::cout << "Dst Device: Index " << dst_dev_idx + << ", Type: " << ((dst_dev_type == HSA_DEVICE_TYPE_CPU) ? "CPU" : "GPU") << std::endl; + std::cout << "Rocm Device hosting Src Memory cannot ACCESS Dst Memory" << std::endl; + std::cout << std::endl; } // @brief: Prints error message when a request to read / write from // a buffer of a device is not possible void RocmBandwidthTest::PrintIOAccessError(uint32_t exec_idx, uint32_t pool_idx) { + // Retrieve device type of executing device + hsa_device_type_t exec_dev_type = agent_list_[exec_idx].device_type_; - // Retrieve device type of executing device - hsa_device_type_t exec_dev_type = agent_list_[exec_idx].device_type_; - - // Retrieve device type of memory pool's device - uint32_t pool_dev_idx = pool_list_[pool_idx].agent_index_; - hsa_device_type_t pool_dev_type = agent_list_[pool_dev_idx].device_type_; - - std::cout << std::endl; - std::cout << "Index of Executing Device: " << exec_idx << std::endl; - std::cout << "Device Type of Executing Device: " << exec_dev_type << std::endl; - - std::cout << "Index of Buffer: " << pool_idx << std::endl; - std::cout << "Index of Buffer's Device: " << pool_dev_idx << std::endl; - std::cout << "Device Type Hosting Buffer: " << pool_dev_type << std::endl; - std::cout << "Rocm Device executing Read / Write request cannot ACCESS Buffer" << std::endl; - std::cout << std::endl; + // Retrieve device type of memory pool's device + uint32_t pool_dev_idx = pool_list_[pool_idx].agent_index_; + hsa_device_type_t pool_dev_type = agent_list_[pool_dev_idx].device_type_; + + std::cout << std::endl; + std::cout << "Index of Executing Device: " << exec_idx << std::endl; + std::cout << "Device Type of Executing Device: " << exec_dev_type << std::endl; + + std::cout << "Index of Buffer: " << pool_idx << std::endl; + std::cout << "Index of Buffer's Device: " << pool_dev_idx << std::endl; + std::cout << "Device Type Hosting Buffer: " << pool_dev_type << std::endl; + std::cout << "Rocm Device executing Read / Write request cannot ACCESS Buffer" << std::endl; + std::cout << std::endl; } diff --git a/rocm_bandwidth_test_report.cpp b/rocm_bandwidth_test_report.cpp old mode 100755 new mode 100644 index d4a0f60..c57463c --- a/rocm_bandwidth_test_report.cpp +++ b/rocm_bandwidth_test_report.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,337 +43,315 @@ #include "common.hpp" #include "rocm_bandwidth_test.hpp" +#include #include #include -#include -static void printRecord(size_t size, double avg_time, - double avg_bandwidth, double min_time, +static void printRecord(size_t size, double avg_time, double avg_bandwidth, double min_time, double peak_bandwidth) { + std::stringstream size_str; + if (size < 1024) { + size_str << size << " Bytes"; + } else if (size < 1024 * 1024) { + size_str << size / 1024 << " KB"; + } else { + size_str << size / (1024 * 1024) << " MB"; + } - std::stringstream size_str; - if (size < 1024) { - size_str << size << " Bytes"; - } else if (size < 1024 * 1024) { - size_str << size / 1024 << " KB"; - } else { - size_str << size / (1024 * 1024) << " MB"; - } - - uint32_t format = 15; - std::cout.precision(3); - std::cout << std::fixed; - std::cout.width(format); - std::cout << size_str.str(); - std::cout.width(format); - std::cout << (avg_time * 1e6); - std::cout.width(format); - std::cout << avg_bandwidth; - std::cout.width(format); - std::cout << (min_time * 1e6); - std::cout.width(format); - std::cout << peak_bandwidth; - std::cout << std::endl; + uint32_t format = 15; + std::cout.precision(3); + std::cout << std::fixed; + std::cout.width(format); + std::cout << size_str.str(); + std::cout.width(format); + std::cout << (avg_time * 1e6); + std::cout.width(format); + std::cout << avg_bandwidth; + std::cout.width(format); + std::cout << (min_time * 1e6); + std::cout.width(format); + std::cout << peak_bandwidth; + std::cout << std::endl; } -static void printCopyBanner(uint32_t src_pool_id, uint32_t src_agent_type, - uint32_t dst_pool_id, uint32_t dst_agent_type, - bool unidir) { - - std::stringstream src_type; - std::stringstream dst_type; - (src_agent_type == 0) ? src_type << "Cpu" : src_type << "Gpu"; - (dst_agent_type == 0) ? dst_type << "Cpu" : dst_type << "Gpu"; - - std::cout << std::endl; - std::cout << "================"; - if (unidir) { - std::cout << " Unidirectional Benchmark Result"; - } else { - std::cout << " Bidirectional Benchmark Result"; - } - std::cout << " ================"; - std::cout << std::endl; - std::cout << "================"; - std::cout << " Src Device Id: " << src_pool_id; - std::cout << " Src Device Type: " << src_type.str(); - std::cout << " ================"; - std::cout << std::endl; - std::cout << "================"; - std::cout << " Dst Device Id: " << dst_pool_id; - std::cout << " Dst Device Type: " << dst_type.str(); - std::cout << " ================"; - std::cout << std::endl; - std::cout << std::endl; - - uint32_t format = 15; - std::cout.setf(ios::left); - std::cout.width(format); - std::cout << "Data Size"; - std::cout.width(format); - std::cout << "Avg Time(us)"; - std::cout.width(format); - std::cout << "Avg BW(GB/s)"; - std::cout.width(format); - std::cout << "Min Time(us)"; - std::cout.width(format); - std::cout << "Peak BW(GB/s)"; - std::cout << std::endl; +static void printCopyBanner(uint32_t src_pool_id, uint32_t src_agent_type, uint32_t dst_pool_id, + uint32_t dst_agent_type, bool unidir) { + std::stringstream src_type; + std::stringstream dst_type; + (src_agent_type == 0) ? src_type << "Cpu" : src_type << "Gpu"; + (dst_agent_type == 0) ? dst_type << "Cpu" : dst_type << "Gpu"; + + std::cout << std::endl; + std::cout << "================"; + if (unidir) { + std::cout << " Unidirectional Benchmark Result"; + } else { + std::cout << " Bidirectional Benchmark Result"; + } + std::cout << " ================"; + std::cout << std::endl; + std::cout << "================"; + std::cout << " Src Device Id: " << src_pool_id; + std::cout << " Src Device Type: " << src_type.str(); + std::cout << " ================"; + std::cout << std::endl; + std::cout << "================"; + std::cout << " Dst Device Id: " << dst_pool_id; + std::cout << " Dst Device Type: " << dst_type.str(); + std::cout << " ================"; + std::cout << std::endl; + std::cout << std::endl; + + uint32_t format = 15; + std::cout.setf(ios::left); + std::cout.width(format); + std::cout << "Data Size"; + std::cout.width(format); + std::cout << "Avg Time(us)"; + std::cout.width(format); + std::cout << "Avg BW(GB/s)"; + std::cout.width(format); + std::cout << "Min Time(us)"; + std::cout.width(format); + std::cout << "Peak BW(GB/s)"; + std::cout << std::endl; } double RocmBandwidthTest::GetMinTime(std::vector& vec) { - - std::sort(vec.begin(), vec.end()); - return vec.at(0); + std::sort(vec.begin(), vec.end()); + return vec.at(0); } double RocmBandwidthTest::GetMeanTime(std::vector& vec) { + // In validation mode we run only one iteration + if (validate_) { + return vec.at(0); + } - // In validation mode we run only one iteration - if (validate_) { - return vec.at(0); - } - - // Number of elements is ONE plus number of iterations - std::sort(vec.begin(), vec.end()); - vec.erase(vec.end() - 1); - - double mean = 0.0; - int num = vec.size(); - for (int it = 0; it < num; it++) { - mean += vec[it]; - } - mean /= num; - return mean; + // Number of elements is ONE plus number of iterations + std::sort(vec.begin(), vec.end()); + vec.erase(vec.end() - 1); + + double mean = 0.0; + int num = vec.size(); + for (int it = 0; it < num; it++) { + mean += vec[it]; + } + mean /= num; + return mean; } void RocmBandwidthTest::Display() const { - - // Iterate through list of transactions and display its timing data - uint32_t trans_size = trans_list_.size(); - if (trans_size == 0) { - std::cout << std::endl; - std::cout << " Invalid Request" << std::endl; - std::cout << std::endl; - return; - } - - if (validate_) { - PrintVersion(); - DisplayDevInfo(); - PrintLinkPropsMatrix(LINK_PROP_ACCESS); - DisplayValidationMatrix(); - return; - } - - if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { - PrintVersion(); - DisplayDevInfo(); - PrintLinkPropsMatrix(LINK_PROP_ACCESS); - PrintLinkPropsMatrix(LINK_PROP_WEIGHT); - DisplayCopyTimeMatrix(true); - return; - } - - if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { - if (bw_default_run_ == NULL) { - PrintVersion(); - DisplayDevInfo(); - PrintLinkPropsMatrix(LINK_PROP_ACCESS); - PrintLinkPropsMatrix(LINK_PROP_WEIGHT); - } - DisplayCopyTimeMatrix(true); - return; - } - - if ((req_copy_bidir_ == REQ_COPY_BIDIR) || - (req_copy_unidir_ == REQ_COPY_UNIDIR) || - (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || - (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { - PrintVersion(); - } - - for (uint32_t idx = 0; idx < trans_size; idx++) { - async_trans_t trans = trans_list_[idx]; - if ((trans.req_type_ == REQ_COPY_BIDIR) || - (trans.req_type_ == REQ_COPY_UNIDIR) || - (trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) || - (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) { - DisplayCopyTime(trans); + // Iterate through list of transactions and display its timing data + uint32_t trans_size = trans_list_.size(); + if (trans_size == 0) { + std::cout << std::endl; + std::cout << " Invalid Request" << std::endl; + std::cout << std::endl; + return; } - if ((trans.req_type_ == REQ_READ) || - (trans.req_type_ == REQ_WRITE)) { - DisplayIOTime(trans); + + if (validate_) { + PrintVersion(); + DisplayDevInfo(); + PrintLinkPropsMatrix(LINK_PROP_ACCESS); + DisplayValidationMatrix(); + return; } - } - std::cout << std::endl; -} -void RocmBandwidthTest::DisplayIOTime(async_trans_t& trans) const { + if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { + PrintVersion(); + DisplayDevInfo(); + PrintLinkPropsMatrix(LINK_PROP_ACCESS); + PrintLinkPropsMatrix(LINK_PROP_WEIGHT); + DisplayCopyTimeMatrix(true); + return; + } -} + if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { + if (bw_default_run_ == NULL) { + PrintVersion(); + DisplayDevInfo(); + PrintLinkPropsMatrix(LINK_PROP_ACCESS); + PrintLinkPropsMatrix(LINK_PROP_WEIGHT); + } + DisplayCopyTimeMatrix(true); + return; + } -void RocmBandwidthTest::DisplayCopyTime(async_trans_t& trans) const { + if ((req_copy_bidir_ == REQ_COPY_BIDIR) || (req_copy_unidir_ == REQ_COPY_UNIDIR) || + (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || + (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { + PrintVersion(); + } - // Print Benchmark Header - uint32_t src_idx = trans.copy.src_idx_; - uint32_t dst_idx = trans.copy.dst_idx_; - uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - - bool unidir = ((trans.req_type_ == REQ_COPY_UNIDIR) || - (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)); - printCopyBanner(src_idx, src_dev_type, dst_idx, dst_dev_type, unidir); - - uint32_t size_len = size_list_.size(); - for (uint32_t idx = 0; idx < size_len; idx++) { - printRecord(size_list_[idx], trans.avg_time_[idx], - trans.avg_bandwidth_[idx], trans.min_time_[idx], - trans.peak_bandwidth_[idx]); - } + for (uint32_t idx = 0; idx < trans_size; idx++) { + async_trans_t trans = trans_list_[idx]; + if ((trans.req_type_ == REQ_COPY_BIDIR) || (trans.req_type_ == REQ_COPY_UNIDIR) || + (trans.req_type_ == REQ_CONCURRENT_COPY_BIDIR) || + (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)) { + DisplayCopyTime(trans); + } + if ((trans.req_type_ == REQ_READ) || (trans.req_type_ == REQ_WRITE)) { + DisplayIOTime(trans); + } + } + std::cout << std::endl; } -void RocmBandwidthTest::PopulatePerfMatrix(bool peak, double* perf_matrix) const { +void RocmBandwidthTest::DisplayIOTime(async_trans_t& trans) const {} - uint32_t trans_size = trans_list_.size(); - for (uint32_t idx = 0; idx < trans_size; idx++) { - async_trans_t trans = trans_list_[idx]; +void RocmBandwidthTest::DisplayCopyTime(async_trans_t& trans) const { + // Print Benchmark Header uint32_t src_idx = trans.copy.src_idx_; uint32_t dst_idx = trans.copy.dst_idx_; uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - // For COPY_ALL_UNIDIR and COPY_ALL_BIDIR we use only one copy size - double bandwidth = (peak) ? trans.peak_bandwidth_[0] : trans.avg_bandwidth_[0]; - perf_matrix[(src_dev_idx * agent_index_) + dst_dev_idx] = bandwidth; - if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { - perf_matrix[(dst_dev_idx * agent_index_) + src_dev_idx] = bandwidth; + bool unidir = + ((trans.req_type_ == REQ_COPY_UNIDIR) || (trans.req_type_ == REQ_CONCURRENT_COPY_UNIDIR)); + printCopyBanner(src_idx, src_dev_type, dst_idx, dst_dev_type, unidir); + + uint32_t size_len = size_list_.size(); + for (uint32_t idx = 0; idx < size_len; idx++) { + printRecord(size_list_[idx], trans.avg_time_[idx], trans.avg_bandwidth_[idx], + trans.min_time_[idx], trans.peak_bandwidth_[idx]); } - } +} +void RocmBandwidthTest::PopulatePerfMatrix(bool peak, double* perf_matrix) const { + uint32_t trans_size = trans_list_.size(); + for (uint32_t idx = 0; idx < trans_size; idx++) { + async_trans_t trans = trans_list_[idx]; + uint32_t src_idx = trans.copy.src_idx_; + uint32_t dst_idx = trans.copy.dst_idx_; + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + + // For COPY_ALL_UNIDIR and COPY_ALL_BIDIR we use only one copy size + double bandwidth = (peak) ? trans.peak_bandwidth_[0] : trans.avg_bandwidth_[0]; + perf_matrix[(src_dev_idx * agent_index_) + dst_dev_idx] = bandwidth; + if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { + perf_matrix[(dst_dev_idx * agent_index_) + src_dev_idx] = bandwidth; + } + } } void RocmBandwidthTest::PrintPerfMatrix(bool validate, bool peak, double* perf_matrix) const { + uint32_t format = 10; + std::cout.setf(ios::left); - uint32_t format = 10; - std::cout.setf(ios::left); - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - - if (validate == false) { - if ((peak) && (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { - std::cout << "Unidirectional copy peak bandwidth GB/s"; - } - - if ((peak == false) && (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { - std::cout << "Unidirectional copy average bandwidth GB/s"; - } - - if ((peak) && (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { - std::cout << "Bidirectional copy peak bandwidth GB/s"; - } - - if ((peak == false) && (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { - std::cout << "Bidirectional copy average bandwidth GB/s"; - } - } else { - std::cout << "Data Path Validation"; - } - - std::cout << std::endl; - std::cout << std::endl; - std::cout.precision(3); - std::cout << std::fixed; - - std::cout.width(format); - std::cout << ""; - std::cout.width(format); - std::cout << "D/D"; - format = 12; - for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { - std::cout.width(format); - std::stringstream agent_id; - agent_id << idx0; - std::cout << agent_id.str(); - } - std::cout << std::endl; - std::cout << std::endl; - for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { - format = 10; std::cout.width(format); std::cout << ""; - std::stringstream agent_id; - agent_id << idx0; std::cout.width(format); - std::cout << agent_id.str(); - for (uint32_t idx1 = 0; idx1 < agent_index_; idx1++) { - format = 12; - std::cout.width(format); - double value = perf_matrix[(idx0 * agent_index_) + idx1]; - if (validate) { - if (value == 0) { - std::cout << "N/A"; - } else if (value == VALIDATE_COPY_OP_FAILURE) { - std::cout << "FAIL"; - } else { - std::cout << "PASS"; + + if (validate == false) { + if ((peak) && (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { + std::cout << "Unidirectional copy peak bandwidth GB/s"; } - } else { - if (value == 0) { - std::cout << "N/A"; - } else { - std::cout << perf_matrix[(idx0 * agent_index_) + idx1]; + + if ((peak == false) && (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { + std::cout << "Unidirectional copy average bandwidth GB/s"; + } + + if ((peak) && (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { + std::cout << "Bidirectional copy peak bandwidth GB/s"; } - } + + if ((peak == false) && (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { + std::cout << "Bidirectional copy average bandwidth GB/s"; + } + } else { + std::cout << "Data Path Validation"; } + std::cout << std::endl; std::cout << std::endl; - } - std::cout << std::endl; + std::cout.precision(3); + std::cout << std::fixed; + + std::cout.width(format); + std::cout << ""; + std::cout.width(format); + std::cout << "D/D"; + format = 12; + for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { + std::cout.width(format); + std::stringstream agent_id; + agent_id << idx0; + std::cout << agent_id.str(); + } + std::cout << std::endl; + std::cout << std::endl; + for (uint32_t idx0 = 0; idx0 < agent_index_; idx0++) { + format = 10; + std::cout.width(format); + std::cout << ""; + std::stringstream agent_id; + agent_id << idx0; + std::cout.width(format); + std::cout << agent_id.str(); + for (uint32_t idx1 = 0; idx1 < agent_index_; idx1++) { + format = 12; + std::cout.width(format); + double value = perf_matrix[(idx0 * agent_index_) + idx1]; + if (validate) { + if (value == 0) { + std::cout << "N/A"; + } else if (value == VALIDATE_COPY_OP_FAILURE) { + std::cout << "FAIL"; + } else { + std::cout << "PASS"; + } + } else { + if (value == 0) { + std::cout << "N/A"; + } else { + std::cout << perf_matrix[(idx0 * agent_index_) + idx1]; + } + } + } + std::cout << std::endl; + std::cout << std::endl; + } + std::cout << std::endl; } void RocmBandwidthTest::DisplayCopyTimeMatrix(bool peak) const { - - double* perf_matrix = new double[agent_index_ * agent_index_](); - PopulatePerfMatrix(peak, perf_matrix); - PrintPerfMatrix(false, peak, perf_matrix); - delete[] perf_matrix; + double* perf_matrix = new double[agent_index_ * agent_index_](); + PopulatePerfMatrix(peak, perf_matrix); + PrintPerfMatrix(false, peak, perf_matrix); + delete[] perf_matrix; } void RocmBandwidthTest::DisplayValidationMatrix() const { - - double* perf_matrix = new double[agent_index_ * agent_index_](); - PopulatePerfMatrix(true, perf_matrix); - PrintPerfMatrix(true, true, perf_matrix); - delete[] perf_matrix; + double* perf_matrix = new double[agent_index_ * agent_index_](); + PopulatePerfMatrix(true, perf_matrix); + PrintPerfMatrix(true, true, perf_matrix); + delete[] perf_matrix; } void RocmBandwidthTest::DisplayDevInfo() const { + uint32_t format = 10; + std::cout.setf(ios::left); - uint32_t format = 10; - std::cout.setf(ios::left); - - std::cout << std::endl; - for (uint32_t idx = 0; idx < agent_index_; idx++) { - uint32_t active = active_agents_list_[idx]; - if (active == 1) { - std::cout.width(format); - std::cout << ""; - std::cout << "Device: " << idx; - std::cout << ", " << agent_list_[idx].name_; - bool gpuDevice = (agent_list_[idx].device_type_ == HSA_DEVICE_TYPE_GPU); - if (gpuDevice) { - std::cout << ", " << agent_list_[idx].uuid_; - std::cout << ", " << agent_list_[idx].bdf_id_; - } - std::cout << std::endl; + std::cout << std::endl; + for (uint32_t idx = 0; idx < agent_index_; idx++) { + uint32_t active = active_agents_list_[idx]; + if (active == 1) { + std::cout.width(format); + std::cout << ""; + std::cout << "Device: " << idx; + std::cout << ", " << agent_list_[idx].name_; + bool gpuDevice = (agent_list_[idx].device_type_ == HSA_DEVICE_TYPE_GPU); + if (gpuDevice) { + std::cout << ", " << agent_list_[idx].uuid_; + std::cout << ", " << agent_list_[idx].bdf_id_; + } + std::cout << std::endl; + } } - } - std::cout << std::endl; + std::cout << std::endl; } - - diff --git a/rocm_bandwidth_test_topology.cpp b/rocm_bandwidth_test_topology.cpp old mode 100755 new mode 100644 index f5325e9..84675e6 --- a/rocm_bandwidth_test_topology.cpp +++ b/rocm_bandwidth_test_topology.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,346 +43,324 @@ #include "common.hpp" #include "rocm_bandwidth_test.hpp" +#include #include #include #include -#include // @brief: Helper method to iterate throught the memory pools of // an agent and discover its properties hsa_status_t MemPoolInfo(hsa_amd_memory_pool_t pool, void* data) { - - hsa_status_t status; - RocmBandwidthTest* asyncDrvr = reinterpret_cast(data); - - // Query pools' segment, report only pools from global segment - hsa_amd_segment_t segment; - status = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - ErrorCheck(status); - if (HSA_AMD_SEGMENT_GLOBAL != segment) { - return HSA_STATUS_SUCCESS; - } - - // Determine if allocation is allowed in this pool - // Report only pools that allow an alloction by user - bool alloc = false; - status = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc); - ErrorCheck(status); - if (alloc != true) { - return HSA_STATUS_SUCCESS; - } - - // Query the max allocatable size - size_t max_size = 0; - status = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_SIZE, &max_size); - ErrorCheck(status); - - // Determine if the pools is accessible to all agents - bool access_to_all = false; - status = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_to_all); - ErrorCheck(status); - - // Determine type of access to owner agent - hsa_amd_memory_pool_access_t owner_access; - hsa_agent_t agent = asyncDrvr->agent_list_.back().agent_; - status = hsa_amd_agent_memory_pool_get_info(agent, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &owner_access); - ErrorCheck(status); - - // Determine if the pool is fine-grained or coarse-grained - uint32_t flag = 0; - status = hsa_amd_memory_pool_get_info(pool, - HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - ErrorCheck(status); - bool is_kernarg = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & flag); - bool is_fine_grained = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & flag); - - // Update the pool handle for system memory if kernarg is true - if (is_kernarg) { - asyncDrvr->sys_pool_ = pool; - } - - // Consult user request and add either fine-grained or - // coarse-grained memory pools if agent is CPU. Default - // is to skip coarse-grained memory pools - agent_info_t& agent_info = asyncDrvr->agent_list_.back(); - if (agent_info.device_type_ == HSA_DEVICE_TYPE_CPU) { - if (asyncDrvr->skip_cpu_fine_grain_ != NULL) { - if (is_fine_grained == true) { - return HSA_STATUS_SUCCESS; - } - } else { - // Skip pools that are one of the following: - // Coarse grained - // Fine grained with kernarg being false - if ((is_fine_grained == false) || (is_kernarg == false)) { + hsa_status_t status; + RocmBandwidthTest* asyncDrvr = reinterpret_cast(data); + + // Query pools' segment, report only pools from global segment + hsa_amd_segment_t segment; + status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + ErrorCheck(status); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; - } } - } - - // Consult user request and add either fine-grained or - // coarse-grained memory pools if agent is GPU. Default - // is to skip fine-grained memory pools - if (agent_info.device_type_ == HSA_DEVICE_TYPE_GPU) { - if (asyncDrvr->skip_gpu_coarse_grain_ != NULL) { - if (is_fine_grained == false) { - return HSA_STATUS_SUCCESS; - } - } else { - if (is_fine_grained == true) { + + // Determine if allocation is allowed in this pool + // Report only pools that allow an alloction by user + bool alloc = false; + status = + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc); + ErrorCheck(status); + if (alloc != true) { return HSA_STATUS_SUCCESS; - } } - } - // Create an instance of agent_pool_info and add it to the list - pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool, - segment, max_size, asyncDrvr->pool_index_, - is_fine_grained, is_kernarg, - access_to_all, owner_access); - asyncDrvr->pool_list_.push_back(pool_info); + // Query the max allocatable size + size_t max_size = 0; + status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &max_size); + ErrorCheck(status); + + // Determine if the pools is accessible to all agents + bool access_to_all = false; + status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, + &access_to_all); + ErrorCheck(status); + + // Determine type of access to owner agent + hsa_amd_memory_pool_access_t owner_access; + hsa_agent_t agent = asyncDrvr->agent_list_.back().agent_; + status = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, + &owner_access); + ErrorCheck(status); + + // Determine if the pool is fine-grained or coarse-grained + uint32_t flag = 0; + status = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + ErrorCheck(status); + bool is_kernarg = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & flag); + bool is_fine_grained = (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & flag); + + // Update the pool handle for system memory if kernarg is true + if (is_kernarg) { + asyncDrvr->sys_pool_ = pool; + } + + // Consult user request and add either fine-grained or + // coarse-grained memory pools if agent is CPU. Default + // is to skip coarse-grained memory pools + agent_info_t& agent_info = asyncDrvr->agent_list_.back(); + if (agent_info.device_type_ == HSA_DEVICE_TYPE_CPU) { + if (asyncDrvr->skip_cpu_fine_grain_ != NULL) { + if (is_fine_grained == true) { + return HSA_STATUS_SUCCESS; + } + } else { + // Skip pools that are one of the following: + // Coarse grained + // Fine grained with kernarg being false + if ((is_fine_grained == false) || (is_kernarg == false)) { + return HSA_STATUS_SUCCESS; + } + } + } - // Create an agent_pool_infot and add it to its list - asyncDrvr->agent_pool_list_[asyncDrvr->agent_index_].pool_list.push_back(pool_info); - asyncDrvr->pool_index_++; + // Consult user request and add either fine-grained or + // coarse-grained memory pools if agent is GPU. Default + // is to skip fine-grained memory pools + if (agent_info.device_type_ == HSA_DEVICE_TYPE_GPU) { + if (asyncDrvr->skip_gpu_coarse_grain_ != NULL) { + if (is_fine_grained == false) { + return HSA_STATUS_SUCCESS; + } + } else { + if (is_fine_grained == true) { + return HSA_STATUS_SUCCESS; + } + } + } - return HSA_STATUS_SUCCESS; -} + // Create an instance of agent_pool_info and add it to the list + pool_info_t pool_info(agent, asyncDrvr->agent_index_, pool, segment, max_size, + asyncDrvr->pool_index_, is_fine_grained, is_kernarg, access_to_all, + owner_access); + asyncDrvr->pool_list_.push_back(pool_info); -void PopulateBDF(uint32_t bdf_id, agent_info_t *agent_info) { + // Create an agent_pool_infot and add it to its list + asyncDrvr->agent_pool_list_[asyncDrvr->agent_index_].pool_list.push_back(pool_info); + asyncDrvr->pool_index_++; - uint8_t func_id = (bdf_id & 0x00000003); - uint8_t dev_id = ((bdf_id & 0x000000F8) >> 3); - uint8_t bus_id = ((bdf_id & 0x0000FF00) >> 8); - std::stringstream stream; - stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2); - stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id; - std::strcpy(agent_info->bdf_id_, (stream.str()).c_str()); + return HSA_STATUS_SUCCESS; +} + +void PopulateBDF(uint32_t bdf_id, agent_info_t* agent_info) { + uint8_t func_id = (bdf_id & 0x00000003); + uint8_t dev_id = ((bdf_id & 0x000000F8) >> 3); + uint8_t bus_id = ((bdf_id & 0x0000FF00) >> 8); + std::stringstream stream; + stream << std::setfill('0') << std::setw(sizeof(uint8_t) * 2); + stream << std::hex << +bus_id << ":" << +dev_id << "." << +func_id; + std::strcpy(agent_info->bdf_id_, (stream.str()).c_str()); } // @brief: Helper method to iterate throught the agents of // a system and discover its properties hsa_status_t AgentInfo(hsa_agent_t agent, void* data) { + RocmBandwidthTest* asyncDrvr = reinterpret_cast(data); + + // Get the name of the agent + char agent_name[64]; + hsa_status_t status; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_name); + ErrorCheck(status); + + // Get device type + hsa_device_type_t device_type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + ErrorCheck(status); + + // Capture the handle of Cpu agent + if (device_type == HSA_DEVICE_TYPE_CPU) { + asyncDrvr->cpu_agent_ = agent; + asyncDrvr->cpu_index_ = asyncDrvr->agent_index_; + } + + // Instantiate an instance of agent_info_t and populate its name + // and BDF fields before adding it to the list of agent_info_t objects + agent_info_t agent_info(agent, asyncDrvr->agent_index_, device_type); + status = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, + (void*)&agent_info.name_[0]); + + // Aqcuire GPU specific properties + // - BDF (a 32-bit integer) + // - UUID (a 21 char string including nil) + if (device_type == HSA_DEVICE_TYPE_GPU) { + status = + hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_UUID, agent_info.uuid_); + uint32_t bdf_id = 0; + status = + hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, (void*)&bdf_id); + PopulateBDF(bdf_id, &agent_info); + } + asyncDrvr->agent_list_.push_back(agent_info); + + // Contruct an new agent_pool_info structure and add it to the list + agent_pool_info node; + node.agent = asyncDrvr->agent_list_.back(); + asyncDrvr->agent_pool_list_.push_back(node); + + status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, asyncDrvr); + asyncDrvr->agent_index_++; - RocmBandwidthTest* asyncDrvr = reinterpret_cast(data); - - // Get the name of the agent - char agent_name[64]; - hsa_status_t status; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_name); - ErrorCheck(status); - - // Get device type - hsa_device_type_t device_type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); - ErrorCheck(status); - - // Capture the handle of Cpu agent - if (device_type == HSA_DEVICE_TYPE_CPU) { - asyncDrvr->cpu_agent_ = agent; - asyncDrvr->cpu_index_ = asyncDrvr->agent_index_; - } - - // Instantiate an instance of agent_info_t and populate its name - // and BDF fields before adding it to the list of agent_info_t objects - agent_info_t agent_info(agent, asyncDrvr->agent_index_, device_type); - status = hsa_agent_get_info(agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, - (void *)&agent_info.name_[0]); - - // Aqcuire GPU specific properties - // - BDF (a 32-bit integer) - // - UUID (a 21 char string including nil) - if (device_type == HSA_DEVICE_TYPE_GPU) { - status = hsa_agent_get_info(agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_UUID, - agent_info.uuid_); - uint32_t bdf_id = 0; - status = hsa_agent_get_info(agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, - (void *)&bdf_id); - PopulateBDF(bdf_id, &agent_info); - } - asyncDrvr->agent_list_.push_back(agent_info); - - // Contruct an new agent_pool_info structure and add it to the list - agent_pool_info node; - node.agent = asyncDrvr->agent_list_.back(); - asyncDrvr->agent_pool_list_.push_back(node); - - status = hsa_amd_agent_iterate_memory_pools(agent, MemPoolInfo, asyncDrvr); - asyncDrvr->agent_index_++; - - return HSA_STATUS_SUCCESS; + return HSA_STATUS_SUCCESS; } void RocmBandwidthTest::PopulateAccessMatrix() { - - // Allocate memory to hold access lists - access_matrix_ = new uint32_t[agent_index_ * agent_index_](); - direct_access_matrix_ = new uint32_t[agent_index_ * agent_index_](); - - hsa_status_t status; - uint32_t size = pool_list_.size(); - for (uint32_t src_idx = 0; src_idx < size; src_idx++) { - - // Get handle of Src agent of the pool - uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; - hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_; - hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - - for (uint32_t dst_idx = 0; dst_idx < size; dst_idx++) { - - // Get handle of Dst pool - uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; - hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_; - hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - - // Determine if src agent has access to dst pool - hsa_amd_memory_pool_access_t access; - status = hsa_amd_agent_memory_pool_get_info(src_agent, dst_pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); - ErrorCheck(status); - - // Record if Src device can access or not - uint32_t path; - path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1; - direct_access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path; - - if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && - (dst_dev_type == HSA_DEVICE_TYPE_GPU) && - (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)) { - status = hsa_amd_agent_memory_pool_get_info(dst_agent, src_pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); - ErrorCheck(status); - } - - // Access between the two agents is Non-Existent - path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1; - access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path; + // Allocate memory to hold access lists + access_matrix_ = new uint32_t[agent_index_ * agent_index_](); + direct_access_matrix_ = new uint32_t[agent_index_ * agent_index_](); + + hsa_status_t status; + uint32_t size = pool_list_.size(); + for (uint32_t src_idx = 0; src_idx < size; src_idx++) { + // Get handle of Src agent of the pool + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_agent_t src_agent = pool_list_[src_idx].owner_agent_; + hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + for (uint32_t dst_idx = 0; dst_idx < size; dst_idx++) { + // Get handle of Dst pool + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_agent_t dst_agent = pool_list_[dst_idx].owner_agent_; + hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + // Determine if src agent has access to dst pool + hsa_amd_memory_pool_access_t access; + status = hsa_amd_agent_memory_pool_get_info( + src_agent, dst_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + + // Record if Src device can access or not + uint32_t path; + path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1; + direct_access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path; + + if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && (dst_dev_type == HSA_DEVICE_TYPE_GPU) && + (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED)) { + status = hsa_amd_agent_memory_pool_get_info( + dst_agent, src_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + } + + // Access between the two agents is Non-Existent + path = (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? 0 : 1; + access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx] = path; + } } - } } void RocmBandwidthTest::DiscoverTopology() { + // Populate the lists of agents and pools + err_ = hsa_iterate_agents(AgentInfo, this); - // Populate the lists of agents and pools - err_ = hsa_iterate_agents(AgentInfo, this); - - // Populate the access, link type and weight matrices - // Access matrix must be populated first - PopulateAccessMatrix(); - DiscoverLinkProps(); + // Populate the access, link type and weight matrices + // Access matrix must be populated first + PopulateAccessMatrix(); + DiscoverLinkProps(); } -uint32_t GetLinkType(hsa_device_type_t src_dev_type, - hsa_device_type_t dst_dev_type, +uint32_t GetLinkType(hsa_device_type_t src_dev_type, hsa_device_type_t dst_dev_type, hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops) { - - // Link type is ignored, linkinfo is illegal - // Currently Thunk collapses multi-hop paths into one - // while accumulating their numa weight - // @note: Thunk retains the original link type - if (hops != 1) { + // Link type is ignored, linkinfo is illegal + // Currently Thunk collapses multi-hop paths into one + // while accumulating their numa weight + // @note: Thunk retains the original link type + if (hops != 1) { + return RocmBandwidthTest::LINK_TYPE_IGNORED; + } + + // Return link type only if it specified as XGMI + if ((link_info[0]).link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { + return RocmBandwidthTest::LINK_TYPE_XGMI; + } + + // In this case all we know is there is a path involving + // one or more links. Since it binding either two GPU's or + // one Gpu and one Cpu, we infer it to be of type PCIe + if ((src_dev_type == HSA_DEVICE_TYPE_GPU) || (dst_dev_type == HSA_DEVICE_TYPE_GPU)) { + return RocmBandwidthTest::LINK_TYPE_PCIE; + } + + // This occurs when both devices are CPU's return RocmBandwidthTest::LINK_TYPE_IGNORED; - } - - // Return link type only if it specified as XGMI - if ((link_info[0]).link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { - return RocmBandwidthTest::LINK_TYPE_XGMI; - } - - // In this case all we know is there is a path involving - // one or more links. Since it binding either two GPU's or - // one Gpu and one Cpu, we infer it to be of type PCIe - if ((src_dev_type == HSA_DEVICE_TYPE_GPU) || - (dst_dev_type == HSA_DEVICE_TYPE_GPU)) { - return RocmBandwidthTest::LINK_TYPE_PCIE; - } - - // This occurs when both devices are CPU's - return RocmBandwidthTest::LINK_TYPE_IGNORED; } uint32_t GetLinkWeight(hsa_amd_memory_pool_link_info_t* link_info, uint32_t hops) { - - uint32_t weight = 0; - for(uint32_t hopIdx = 0; hopIdx < hops; hopIdx++) { - weight += (link_info[hopIdx]).numa_distance; - } - return weight; + uint32_t weight = 0; + for (uint32_t hopIdx = 0; hopIdx < hops; hopIdx++) { + weight += (link_info[hopIdx]).numa_distance; + } + return weight; } void RocmBandwidthTest::BindLinkProps(uint32_t idx1, uint32_t idx2) { - - // Agent has no pools so no need to look for numa distance - if (agent_pool_list_[idx2].pool_list.size() == 0) { - link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; - link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; - link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH; - return; - } - - uint32_t hops = 0; - hsa_agent_t agent1 = agent_list_[idx1].agent_; - hsa_amd_memory_pool_t& pool = agent_pool_list_[idx2].pool_list[0].pool_; - err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops); - if (hops < 1) { - link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; - link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; - link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH; - return; - } - - hsa_amd_memory_pool_link_info_t *link_info; - uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t); - link_info = (hsa_amd_memory_pool_link_info_t *)malloc(link_info_sz); - std::memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t))); - err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info); - - - link_hops_matrix_[(idx1 * agent_index_) + idx2] = hops; - link_weight_matrix_[(idx1 * agent_index_) + idx2] = GetLinkWeight(link_info, hops); - - // Initialize link type based on Src and Dst devices plus link - // type reported by ROCr library - hsa_device_type_t src_dev_type = agent_list_[idx1].device_type_; - hsa_device_type_t dst_dev_type = agent_list_[idx2].device_type_; - link_type_matrix_[(idx1 * agent_index_) + idx2] = GetLinkType(src_dev_type, - dst_dev_type, link_info, hops); - // Free the allocated link block - free(link_info); + // Agent has no pools so no need to look for numa distance + if (agent_pool_list_[idx2].pool_list.size() == 0) { + link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; + link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; + link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH; + return; + } + + uint32_t hops = 0; + hsa_agent_t agent1 = agent_list_[idx1].agent_; + hsa_amd_memory_pool_t& pool = agent_pool_list_[idx2].pool_list[0].pool_; + err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops); + if (hops < 1) { + link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; + link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0xFFFFFFFF; + link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_NO_PATH; + return; + } + + hsa_amd_memory_pool_link_info_t* link_info; + uint32_t link_info_sz = hops * sizeof(hsa_amd_memory_pool_link_info_t); + link_info = (hsa_amd_memory_pool_link_info_t*)malloc(link_info_sz); + std::memset(link_info, 0, (hops * sizeof(hsa_amd_memory_pool_link_info_t))); + err_ = hsa_amd_agent_memory_pool_get_info(agent1, pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info); + + + link_hops_matrix_[(idx1 * agent_index_) + idx2] = hops; + link_weight_matrix_[(idx1 * agent_index_) + idx2] = GetLinkWeight(link_info, hops); + + // Initialize link type based on Src and Dst devices plus link + // type reported by ROCr library + hsa_device_type_t src_dev_type = agent_list_[idx1].device_type_; + hsa_device_type_t dst_dev_type = agent_list_[idx2].device_type_; + link_type_matrix_[(idx1 * agent_index_) + idx2] = + GetLinkType(src_dev_type, dst_dev_type, link_info, hops); + // Free the allocated link block + free(link_info); } void RocmBandwidthTest::DiscoverLinkProps() { + // Allocate space if it is first time + if (link_weight_matrix_ == NULL) { + link_type_matrix_ = new uint32_t[agent_index_ * agent_index_](); + link_hops_matrix_ = new uint32_t[agent_index_ * agent_index_](); + link_weight_matrix_ = new uint32_t[agent_index_ * agent_index_](); + } - // Allocate space if it is first time - if (link_weight_matrix_ == NULL) { - link_type_matrix_ = new uint32_t[agent_index_ * agent_index_](); - link_hops_matrix_ = new uint32_t[agent_index_ * agent_index_](); - link_weight_matrix_ = new uint32_t[agent_index_ * agent_index_](); - } - - agent_info_t agent_info; - for (uint32_t idx1 = 0; idx1 < agent_index_; idx1++) { - for (uint32_t idx2 = 0; idx2 < agent_index_; idx2++) { - if (idx1 == idx2) { - link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0; - link_weight_matrix_[(idx1 *agent_index_) + idx2] = 0; - link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_SELF; - continue; - } - BindLinkProps(idx1, idx2); + agent_info_t agent_info; + for (uint32_t idx1 = 0; idx1 < agent_index_; idx1++) { + for (uint32_t idx2 = 0; idx2 < agent_index_; idx2++) { + if (idx1 == idx2) { + link_hops_matrix_[(idx1 * agent_index_) + idx2] = 0; + link_weight_matrix_[(idx1 * agent_index_) + idx2] = 0; + link_type_matrix_[(idx1 * agent_index_) + idx2] = LINK_TYPE_SELF; + continue; + } + BindLinkProps(idx1, idx2); + } } - } } - diff --git a/rocm_bandwidth_test_trans.cpp b/rocm_bandwidth_test_trans.cpp old mode 100755 new mode 100644 index 7aa4fdc..0815049 --- a/rocm_bandwidth_test_trans.cpp +++ b/rocm_bandwidth_test_trans.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -43,428 +43,393 @@ #include "common.hpp" #include "rocm_bandwidth_test.hpp" -bool RocmBandwidthTest::FindMirrorRequest(bool reverse, - uint32_t src_idx, uint32_t dst_idx) { - - uint32_t size = trans_list_.size(); - for (uint32_t idx = 0; idx < size; idx++) { - async_trans_t& mirror = trans_list_[idx]; - if(reverse) { - if ((src_idx == mirror.copy.dst_idx_) && - (dst_idx == mirror.copy.src_idx_)) { - return true; - } - } else { - if ((src_idx == mirror.copy.src_idx_) && - (dst_idx == mirror.copy.dst_idx_)) { - return true; - } +bool RocmBandwidthTest::FindMirrorRequest(bool reverse, uint32_t src_idx, uint32_t dst_idx) { + uint32_t size = trans_list_.size(); + for (uint32_t idx = 0; idx < size; idx++) { + async_trans_t& mirror = trans_list_[idx]; + if (reverse) { + if ((src_idx == mirror.copy.dst_idx_) && (dst_idx == mirror.copy.src_idx_)) { + return true; + } + } else { + if ((src_idx == mirror.copy.src_idx_) && (dst_idx == mirror.copy.dst_idx_)) { + return true; + } + } } - } - return false; + return false; } -bool RocmBandwidthTest::BuildReadOrWriteTrans(uint32_t req_type, - vector& in_list) { - - // Validate the list of pool-agent tuples - hsa_status_t status; - hsa_amd_memory_pool_access_t access; - uint32_t list_size = in_list.size(); - for (uint32_t idx = 0; idx < list_size; idx+=2) { +bool RocmBandwidthTest::BuildReadOrWriteTrans(uint32_t req_type, vector& in_list) { + // Validate the list of pool-agent tuples + hsa_status_t status; + hsa_amd_memory_pool_access_t access; + uint32_t list_size = in_list.size(); + for (uint32_t idx = 0; idx < list_size; idx += 2) { + uint32_t pool_idx = in_list[idx]; + uint32_t exec_idx = in_list[idx + 1]; + + // Retrieve Roc runtime handles for memory pool and agent + hsa_agent_t exec_agent = agent_list_[exec_idx].agent_; + hsa_amd_memory_pool_t pool = pool_list_[pool_idx].pool_; + + // Determine agent can access the memory pool + status = hsa_amd_agent_memory_pool_get_info(exec_agent, pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); + ErrorCheck(status); + + // Determine if accessibility to agent is not denied + if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { + PrintIOAccessError(exec_idx, pool_idx); + return false; + } - uint32_t pool_idx = in_list[idx]; - uint32_t exec_idx = in_list[idx + 1]; + // Agent has access, build an instance of transaction + // and add it to the list of transactions + async_trans_t trans(req_type); + trans.kernel.code_ = NULL; + trans.kernel.pool_ = pool; + trans.kernel.pool_idx_ = pool_idx; + trans.kernel.agent_ = exec_agent; + trans.kernel.agent_idx_ = exec_idx; + trans_list_.push_back(trans); + } + return true; +} - // Retrieve Roc runtime handles for memory pool and agent - hsa_agent_t exec_agent = agent_list_[exec_idx].agent_; - hsa_amd_memory_pool_t pool = pool_list_[pool_idx].pool_; +bool RocmBandwidthTest::BuildReadTrans() { return BuildReadOrWriteTrans(REQ_READ, read_list_); } - // Determine agent can access the memory pool - status = hsa_amd_agent_memory_pool_get_info(exec_agent, pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access); - ErrorCheck(status); +bool RocmBandwidthTest::BuildWriteTrans() { return BuildReadOrWriteTrans(REQ_WRITE, write_list_); } - // Determine if accessibility to agent is not denied - if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) { - PrintIOAccessError(exec_idx, pool_idx); - return false; +bool RocmBandwidthTest::FilterCpuPool(uint32_t req_type, hsa_device_type_t dev_type, + bool fine_grained) { + if ((req_type != REQ_COPY_ALL_BIDIR) && (req_type != REQ_COPY_ALL_UNIDIR)) { + return false; } - // Agent has access, build an instance of transaction - // and add it to the list of transactions - async_trans_t trans(req_type); - trans.kernel.code_ = NULL; - trans.kernel.pool_ = pool; - trans.kernel.pool_idx_ = pool_idx; - trans.kernel.agent_ = exec_agent; - trans.kernel.agent_idx_ = exec_idx; - trans_list_.push_back(trans); - } - return true; -} + // Determine if device is a Cpu - filter out only if + // it is a Cpu device + if (dev_type != HSA_DEVICE_TYPE_CPU) { + return false; + } -bool RocmBandwidthTest::BuildReadTrans() { - return BuildReadOrWriteTrans(REQ_READ, read_list_); -} + // If env to skip fine grain is NULL it means + // we should filter out coarse-grain pools + if (skip_cpu_fine_grain_ == NULL) { + return (fine_grained == false); + } -bool RocmBandwidthTest::BuildWriteTrans() { - return BuildReadOrWriteTrans(REQ_WRITE, write_list_); + // If env to skip fine grain is NON-NULL it means + // we should filter out fine-grain pools + return (fine_grained == true); } -bool RocmBandwidthTest::FilterCpuPool(uint32_t req_type, - hsa_device_type_t dev_type, - bool fine_grained) { - - if ((req_type != REQ_COPY_ALL_BIDIR) && - (req_type != REQ_COPY_ALL_UNIDIR)) { - return false; - } - - // Determine if device is a Cpu - filter out only if - // it is a Cpu device - if (dev_type != HSA_DEVICE_TYPE_CPU) { - return false; - } - - // If env to skip fine grain is NULL it means - // we should filter out coarse-grain pools - if (skip_cpu_fine_grain_ == NULL) { - return (fine_grained == false); - } +bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type, vector& src_list, + vector& dst_list) { + uint32_t src_size = src_list.size(); + uint32_t dst_size = dst_list.size(); + + for (uint32_t idx = 0; idx < src_size; idx++) { + // Retrieve Roc runtime handles for Src memory pool and agents + uint32_t src_idx = src_list[idx]; + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + for (uint32_t jdx = 0; jdx < dst_size; jdx++) { + // Retrieve Roc runtime handles for Dst memory pool and agents + uint32_t dst_idx = dst_list[jdx]; + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + // Filter out transactions that involve only Cpu agents/devices + // without regard to type of request, default run, partial or full + // unidirectional or bidirectional copies + if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { + continue; + } + + // Filter out transactions that involve only same GPU as both + // Src and Dst device if the request is bidirectional copy that + // is either partial or full + if ((req_type == REQ_COPY_BIDIR) || (req_type == REQ_COPY_ALL_BIDIR)) { + if (src_dev_idx == dst_dev_idx) { + continue; + } + + bool mirror = FindMirrorRequest(true, src_idx, dst_idx); + if (mirror) { + continue; + } + } + + // Determine if accessibility to dst pool for src agent is not denied + uint32_t path_exists = access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx]; + if (path_exists == 0) { + if ((req_type == REQ_COPY_ALL_BIDIR) || (req_type == REQ_COPY_ALL_UNIDIR)) { + continue; + } else { + PrintCopyAccessError(src_idx, dst_idx); + return false; + } + } + + // For bidirectional copies determine both access paths are valid + // Both paths are valid when one of the devices is a CPU. This is + // not true when both of the devices are GPU's. + if ((req_type == REQ_COPY_ALL_BIDIR) || (req_type == REQ_COPY_ALL_UNIDIR)) { + path_exists = access_matrix_[(dst_dev_idx * agent_index_) + src_dev_idx]; + if (path_exists == 0) { + continue; + } + } + + // Update the list of agents active in any copy operation + if (active_agents_list_ == NULL) { + active_agents_list_ = new uint32_t[agent_index_](); + } + active_agents_list_[src_dev_idx] = 1; + active_agents_list_[dst_dev_idx] = 1; + + // Agents have access, build an instance of transaction + // and add it to the list of transactions + async_trans_t trans(req_type); + trans.copy.src_idx_ = src_idx; + trans.copy.dst_idx_ = dst_idx; + trans.copy.src_pool_ = src_pool; + trans.copy.dst_pool_ = dst_pool; + trans.copy.bidir_ = ((req_type == REQ_COPY_BIDIR) || (req_type == REQ_COPY_ALL_BIDIR)); + trans.copy.uses_gpu_ = + ((src_dev_type == HSA_DEVICE_TYPE_GPU) || (dst_dev_type == HSA_DEVICE_TYPE_GPU)); + trans_list_.push_back(trans); + } + } - // If env to skip fine grain is NON-NULL it means - // we should filter out fine-grain pools - return (fine_grained == true); + return true; } -bool RocmBandwidthTest::BuildCopyTrans(uint32_t req_type, - vector& src_list, - vector& dst_list) { - - uint32_t src_size = src_list.size(); - uint32_t dst_size = dst_list.size(); - - for (uint32_t idx = 0; idx < src_size; idx++) { - - // Retrieve Roc runtime handles for Src memory pool and agents - uint32_t src_idx = src_list[idx]; - uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; - hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - - for (uint32_t jdx = 0; jdx < dst_size; jdx++) { - - // Retrieve Roc runtime handles for Dst memory pool and agents - uint32_t dst_idx = dst_list[jdx]; - uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; - hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - - // Filter out transactions that involve only Cpu agents/devices - // without regard to type of request, default run, partial or full - // unidirectional or bidirectional copies - if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && - (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { - continue; - } - - // Filter out transactions that involve only same GPU as both - // Src and Dst device if the request is bidirectional copy that - // is either partial or full - if ((req_type == REQ_COPY_BIDIR) || - (req_type == REQ_COPY_ALL_BIDIR)) { - if (src_dev_idx == dst_dev_idx) { - continue; +bool RocmBandwidthTest::BuildConcurrentCopyTrans(uint32_t req_type, vector& dev_list) { + uint32_t size = dev_list.size(); + for (uint32_t idx = 0; idx < size; idx += 2) { + // Retrieve Roc runtime handles for Src memory pool and agents + uint32_t src_idx = dev_list[idx]; + uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; + hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; + hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; + + // Retrieve Roc runtime handles for Dst memory pool and agents + uint32_t dst_idx = dev_list[idx + 1]; + uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; + hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; + hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; + + // Filter out transactions that involve only Cpu agents/devices + // without regard to type of request, default run, partial or full + // unidirectional or bidirectional copies + if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { + continue; } - bool mirror = FindMirrorRequest(true, src_idx, dst_idx); + // Determine there is no duplicate + bool mirror = false; + mirror = FindMirrorRequest(false, src_idx, dst_idx); if (mirror) { - continue; + continue; } - } - - // Determine if accessibility to dst pool for src agent is not denied - uint32_t path_exists = access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx]; - if (path_exists == 0) { - if ((req_type == REQ_COPY_ALL_BIDIR) || - (req_type == REQ_COPY_ALL_UNIDIR)) { - continue; - } else { - PrintCopyAccessError(src_idx, dst_idx); - return false; + + // Filter out transactions that involve only same GPU as both + // Src and Dst device if the request is bidirectional copy that + // is either partial or full + if (req_type == REQ_CONCURRENT_COPY_BIDIR) { + if (src_dev_idx == dst_dev_idx) { + continue; + } + + mirror = FindMirrorRequest(true, src_idx, dst_idx); + if (mirror) { + continue; + } } - } - - // For bidirectional copies determine both access paths are valid - // Both paths are valid when one of the devices is a CPU. This is - // not true when both of the devices are GPU's. - if ((req_type == REQ_COPY_ALL_BIDIR) || - (req_type == REQ_COPY_ALL_UNIDIR)) { - path_exists = access_matrix_[(dst_dev_idx * agent_index_) + src_dev_idx]; + + // Determine if accessibility to dst pool for src agent is not denied + uint32_t path_exists = access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx]; if (path_exists == 0) { - continue; + PrintCopyAccessError(src_idx, dst_idx); + return false; } - } - - // Update the list of agents active in any copy operation - if (active_agents_list_ == NULL) { - active_agents_list_ = new uint32_t[agent_index_](); - } - active_agents_list_[src_dev_idx] = 1; - active_agents_list_[dst_dev_idx] = 1; - - // Agents have access, build an instance of transaction - // and add it to the list of transactions - async_trans_t trans(req_type); - trans.copy.src_idx_ = src_idx; - trans.copy.dst_idx_ = dst_idx; - trans.copy.src_pool_ = src_pool; - trans.copy.dst_pool_ = dst_pool; - trans.copy.bidir_ = ((req_type == REQ_COPY_BIDIR) || - (req_type == REQ_COPY_ALL_BIDIR)); - trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) || - (dst_dev_type == HSA_DEVICE_TYPE_GPU)); - trans_list_.push_back(trans); - } - } - - return true; -} - -bool RocmBandwidthTest::BuildConcurrentCopyTrans(uint32_t req_type, - vector& dev_list) { - - uint32_t size = dev_list.size(); - for (uint32_t idx = 0; idx < size; idx += 2) { - - // Retrieve Roc runtime handles for Src memory pool and agents - uint32_t src_idx = dev_list[idx]; - uint32_t src_dev_idx = pool_list_[src_idx].agent_index_; - hsa_amd_memory_pool_t src_pool = pool_list_[src_idx].pool_; - hsa_device_type_t src_dev_type = agent_list_[src_dev_idx].device_type_; - - // Retrieve Roc runtime handles for Dst memory pool and agents - uint32_t dst_idx = dev_list[idx + 1]; - uint32_t dst_dev_idx = pool_list_[dst_idx].agent_index_; - hsa_amd_memory_pool_t dst_pool = pool_list_[dst_idx].pool_; - hsa_device_type_t dst_dev_type = agent_list_[dst_dev_idx].device_type_; - - // Filter out transactions that involve only Cpu agents/devices - // without regard to type of request, default run, partial or full - // unidirectional or bidirectional copies - if ((src_dev_type == HSA_DEVICE_TYPE_CPU) && - (dst_dev_type == HSA_DEVICE_TYPE_CPU)) { - continue; - } - - // Determine there is no duplicate - bool mirror = false; - mirror = FindMirrorRequest(false, src_idx, dst_idx); - if (mirror) { - continue; - } - // Filter out transactions that involve only same GPU as both - // Src and Dst device if the request is bidirectional copy that - // is either partial or full - if (req_type == REQ_CONCURRENT_COPY_BIDIR) { - if (src_dev_idx == dst_dev_idx) { - continue; - } - - mirror = FindMirrorRequest(true, src_idx, dst_idx); - if (mirror) { - continue; - } - } - - // Determine if accessibility to dst pool for src agent is not denied - uint32_t path_exists = access_matrix_[(src_dev_idx * agent_index_) + dst_dev_idx]; - if (path_exists == 0) { - PrintCopyAccessError(src_idx, dst_idx); - return false; - } + // For bidirectional copies determine both access paths are valid + // Both paths are valid when one of the devices is a CPU. This is + // not true when both of the devices are GPU's. + if (req_type == REQ_CONCURRENT_COPY_BIDIR) { + path_exists = access_matrix_[(dst_dev_idx * agent_index_) + src_dev_idx]; + if (path_exists == 0) { + PrintCopyAccessError(dst_idx, src_idx); + return false; + } + } - // For bidirectional copies determine both access paths are valid - // Both paths are valid when one of the devices is a CPU. This is - // not true when both of the devices are GPU's. - if (req_type == REQ_CONCURRENT_COPY_BIDIR) { - path_exists = access_matrix_[(dst_dev_idx * agent_index_) + src_dev_idx]; - if (path_exists == 0) { - PrintCopyAccessError(dst_idx, src_idx); - return false; - } + // Update the list of agents active in any copy operation + if (active_agents_list_ == NULL) { + active_agents_list_ = new uint32_t[agent_index_](); + } + active_agents_list_[src_dev_idx] = 1; + active_agents_list_[dst_dev_idx] = 1; + + // Agents have access, build an instance of transaction + // and add it to the list of transactions + async_trans_t trans(req_type); + trans.copy.src_idx_ = src_idx; + trans.copy.dst_idx_ = dst_idx; + trans.copy.src_pool_ = src_pool; + trans.copy.dst_pool_ = dst_pool; + trans.copy.bidir_ = (req_type == REQ_CONCURRENT_COPY_BIDIR); + trans.copy.uses_gpu_ = + ((src_dev_type == HSA_DEVICE_TYPE_GPU) || (dst_dev_type == HSA_DEVICE_TYPE_GPU)); + trans_list_.push_back(trans); } - // Update the list of agents active in any copy operation - if (active_agents_list_ == NULL) { - active_agents_list_ = new uint32_t[agent_index_](); - } - active_agents_list_[src_dev_idx] = 1; - active_agents_list_[dst_dev_idx] = 1; - - // Agents have access, build an instance of transaction - // and add it to the list of transactions - async_trans_t trans(req_type); - trans.copy.src_idx_ = src_idx; - trans.copy.dst_idx_ = dst_idx; - trans.copy.src_pool_ = src_pool; - trans.copy.dst_pool_ = dst_pool; - trans.copy.bidir_ = (req_type == REQ_CONCURRENT_COPY_BIDIR); - trans.copy.uses_gpu_ = ((src_dev_type == HSA_DEVICE_TYPE_GPU) || - (dst_dev_type == HSA_DEVICE_TYPE_GPU)); - trans_list_.push_back(trans); - } - - return true; + return true; } bool RocmBandwidthTest::BuildBidirCopyTrans() { - return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_); + return BuildCopyTrans(REQ_COPY_BIDIR, bidir_list_, bidir_list_); } bool RocmBandwidthTest::BuildUnidirCopyTrans() { - return BuildCopyTrans(REQ_COPY_UNIDIR, src_list_, dst_list_); + return BuildCopyTrans(REQ_COPY_UNIDIR, src_list_, dst_list_); } bool RocmBandwidthTest::BuildAllPoolsBidirCopyTrans() { - return BuildCopyTrans(REQ_COPY_ALL_BIDIR, bidir_list_, bidir_list_); + return BuildCopyTrans(REQ_COPY_ALL_BIDIR, bidir_list_, bidir_list_); } bool RocmBandwidthTest::BuildAllPoolsUnidirCopyTrans() { - return BuildCopyTrans(REQ_COPY_ALL_UNIDIR, src_list_, dst_list_); + return BuildCopyTrans(REQ_COPY_ALL_UNIDIR, src_list_, dst_list_); } // @brief: Builds a list of transaction per user request bool RocmBandwidthTest::BuildTransList() { + // Build list of Read transactions per user request + if (req_read_ == REQ_READ) { + return BuildReadTrans(); + } - // Build list of Read transactions per user request - if (req_read_ == REQ_READ) { - return BuildReadTrans(); - } - - // Build list of Write transactions per user request - if (req_write_ == REQ_WRITE) { - return BuildWriteTrans(); - } - - // Build list of Bidirectional Copy transactions per user request - if (req_copy_bidir_ == REQ_COPY_BIDIR) { - return BuildBidirCopyTrans(); - } - - // Build list of Unidirectional Copy transactions per user request - if (req_copy_unidir_ == REQ_COPY_UNIDIR) { - return BuildUnidirCopyTrans(); - } - - // Build list of All Bidir Copy transactions per user request - if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { - return BuildAllPoolsBidirCopyTrans(); - } - - // Build list of All Unidir Copy transactions per user request - if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { - return BuildAllPoolsUnidirCopyTrans(); - } - - // Build list of Bidir Concurrent Copy transactions per user request - if (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) { - return BuildConcurrentCopyTrans(req_concurrent_copy_bidir_, bidir_list_); - } - - // Build list of Unidir Concurrent Copy transactions per user request - if (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR) { - return BuildConcurrentCopyTrans(req_concurrent_copy_unidir_, bidir_list_); - } - - // All of the transaction are built up - return true; -} - -void RocmBandwidthTest::ComputeCopyTime(std::vector& trans_list) { - - uint32_t trans_cnt = trans_list.size(); - for (uint32_t idx = 0; idx < trans_cnt; idx++) { - async_trans_t& trans = trans_list[idx]; - ComputeCopyTime(trans); - } -} + // Build list of Write transactions per user request + if (req_write_ == REQ_WRITE) { + return BuildWriteTrans(); + } -void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) { - - // Get the frequency of Gpu Timestamping - uint64_t sys_freq = 0; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq); - - double avg_time = 0; - double min_time = 0; - size_t data_size = 0; - double avg_bandwidth = 0; - double peak_bandwidth = 0; - uint32_t size_len = size_list_.size(); - for (uint32_t idx = 0; idx < size_len; idx++) { - - // Adjust size of data involved in copy - data_size = size_list_[idx]; - if (trans.copy.bidir_ == true) { - data_size += size_list_[idx]; + // Build list of Bidirectional Copy transactions per user request + if (req_copy_bidir_ == REQ_COPY_BIDIR) { + return BuildBidirCopyTrans(); } - // Double data size if copying the same device - if (trans.copy.src_idx_ == trans.copy.dst_idx_) { - data_size += data_size; + // Build list of Unidirectional Copy transactions per user request + if (req_copy_unidir_ == REQ_COPY_UNIDIR) { + return BuildUnidirCopyTrans(); } - // Get time taken by copy operation. Adjust time from nanoseconds - // to units of seconds - if ((print_cpu_time_) || - (trans.copy.uses_gpu_ != true)) { - avg_time = trans.cpu_avg_time_[idx]; - min_time = trans.cpu_min_time_[idx]; - avg_time = avg_time / 1000 / 1000 / 1000; - min_time = min_time / 1000 / 1000 / 1000; - } else { - avg_time = trans.gpu_avg_time_[idx]; - min_time = trans.gpu_min_time_[idx]; + // Build list of All Bidir Copy transactions per user request + if (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR) { + return BuildAllPoolsBidirCopyTrans(); } - // Determine if there was a validation failure - // @note: Value is set to VALIDATE_COPY_OP_FAILURE - // if copy transaction wa validated and it failed - hsa_status_t verify_status = HSA_STATUS_ERROR; - if ((avg_time != VALIDATE_COPY_OP_FAILURE) && - (min_time != VALIDATE_COPY_OP_FAILURE)) { - verify_status = HSA_STATUS_SUCCESS; + // Build list of All Unidir Copy transactions per user request + if (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR) { + return BuildAllPoolsUnidirCopyTrans(); } - // Adjust Gpu time if there is no validation error - if ((trans.copy.uses_gpu_) && - (print_cpu_time_ == false) && - (verify_status == HSA_STATUS_SUCCESS)) { - avg_time = avg_time / sys_freq; - min_time = min_time / sys_freq; + // Build list of Bidir Concurrent Copy transactions per user request + if (req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) { + return BuildConcurrentCopyTrans(req_concurrent_copy_bidir_, bidir_list_); } - // Compute bandwidth - divide bandwidth with - // 10^9 not 1024^3 to get size in GigaBytes - // @note: For validation failures bandwidth - // is encoded by VALIDATE_COPY_OP_FAILURE - if (verify_status != HSA_STATUS_SUCCESS) { - avg_bandwidth = VALIDATE_COPY_OP_FAILURE; - peak_bandwidth = VALIDATE_COPY_OP_FAILURE; - } else { - avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000; - peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000; + // Build list of Unidir Concurrent Copy transactions per user request + if (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR) { + return BuildConcurrentCopyTrans(req_concurrent_copy_unidir_, bidir_list_); } - // Update computed bandwidth for the transaction - trans.min_time_.push_back(min_time); - trans.avg_time_.push_back(avg_time); - trans.avg_bandwidth_.push_back(avg_bandwidth); - trans.peak_bandwidth_.push_back(peak_bandwidth); - } + // All of the transaction are built up + return true; +} + +void RocmBandwidthTest::ComputeCopyTime(std::vector& trans_list) { + uint32_t trans_cnt = trans_list.size(); + for (uint32_t idx = 0; idx < trans_cnt; idx++) { + async_trans_t& trans = trans_list[idx]; + ComputeCopyTime(trans); + } } +void RocmBandwidthTest::ComputeCopyTime(async_trans_t& trans) { + // Get the frequency of Gpu Timestamping + uint64_t sys_freq = 0; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sys_freq); + + double avg_time = 0; + double min_time = 0; + size_t data_size = 0; + double avg_bandwidth = 0; + double peak_bandwidth = 0; + uint32_t size_len = size_list_.size(); + for (uint32_t idx = 0; idx < size_len; idx++) { + // Adjust size of data involved in copy + data_size = size_list_[idx]; + if (trans.copy.bidir_ == true) { + data_size += size_list_[idx]; + } + + // Double data size if copying the same device + if (trans.copy.src_idx_ == trans.copy.dst_idx_) { + data_size += data_size; + } + + // Get time taken by copy operation. Adjust time from nanoseconds + // to units of seconds + if ((print_cpu_time_) || (trans.copy.uses_gpu_ != true)) { + avg_time = trans.cpu_avg_time_[idx]; + min_time = trans.cpu_min_time_[idx]; + avg_time = avg_time / 1000 / 1000 / 1000; + min_time = min_time / 1000 / 1000 / 1000; + } else { + avg_time = trans.gpu_avg_time_[idx]; + min_time = trans.gpu_min_time_[idx]; + } + + // Determine if there was a validation failure + // @note: Value is set to VALIDATE_COPY_OP_FAILURE + // if copy transaction wa validated and it failed + hsa_status_t verify_status = HSA_STATUS_ERROR; + if ((avg_time != VALIDATE_COPY_OP_FAILURE) && (min_time != VALIDATE_COPY_OP_FAILURE)) { + verify_status = HSA_STATUS_SUCCESS; + } + + // Adjust Gpu time if there is no validation error + if ((trans.copy.uses_gpu_) && (print_cpu_time_ == false) && + (verify_status == HSA_STATUS_SUCCESS)) { + avg_time = avg_time / sys_freq; + min_time = min_time / sys_freq; + } + + // Compute bandwidth - divide bandwidth with + // 10^9 not 1024^3 to get size in GigaBytes + // @note: For validation failures bandwidth + // is encoded by VALIDATE_COPY_OP_FAILURE + if (verify_status != HSA_STATUS_SUCCESS) { + avg_bandwidth = VALIDATE_COPY_OP_FAILURE; + peak_bandwidth = VALIDATE_COPY_OP_FAILURE; + } else { + avg_bandwidth = (double)data_size / avg_time / 1000 / 1000 / 1000; + peak_bandwidth = (double)data_size / min_time / 1000 / 1000 / 1000; + } + + // Update computed bandwidth for the transaction + trans.min_time_.push_back(min_time); + trans.avg_time_.push_back(avg_time); + trans.avg_bandwidth_.push_back(avg_bandwidth); + trans.peak_bandwidth_.push_back(peak_bandwidth); + } +} diff --git a/rocm_bandwidth_test_validate.cpp b/rocm_bandwidth_test_validate.cpp old mode 100755 new mode 100644 index f8e5c59..1610cd9 --- a/rocm_bandwidth_test_validate.cpp +++ b/rocm_bandwidth_test_validate.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -44,155 +44,141 @@ #include "rocm_bandwidth_test.hpp" #include -#include #include + +#include #include #include bool RocmBandwidthTest::PoolIsPresent(vector& in_list) { - - bool is_present; - uint32_t idx1 = 0; - uint32_t idx2 = 0; - uint32_t count = in_list.size(); - uint32_t pool_count = pool_list_.size(); - for (idx1 = 0; idx1 < count; idx1++) { - is_present = false; - for (idx2 = 0; idx2 < pool_count; idx2++) { - if (in_list[idx1] == pool_list_[idx2].index_) { - is_present = true; - break; - } + bool is_present; + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t count = in_list.size(); + uint32_t pool_count = pool_list_.size(); + for (idx1 = 0; idx1 < count; idx1++) { + is_present = false; + for (idx2 = 0; idx2 < pool_count; idx2++) { + if (in_list[idx1] == pool_list_[idx2].index_) { + is_present = true; + break; + } + } + if (is_present == false) { + return false; + } } - if (is_present == false) { - return false; - } - } - return true; + return true; } bool RocmBandwidthTest::PoolIsDuplicated(vector& in_list) { - - uint32_t idx1 = 0; - uint32_t idx2 = 0; - uint32_t count = in_list.size(); - for (idx1 = 0; idx1 < count; idx1++) { - for (idx2 = 0; idx2 < count; idx2++) { - if ((in_list[idx1] == in_list[idx2]) && (idx1 != idx2)){ - return false; - } + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t count = in_list.size(); + for (idx1 = 0; idx1 < count; idx1++) { + for (idx2 = 0; idx2 < count; idx2++) { + if ((in_list[idx1] == in_list[idx2]) && (idx1 != idx2)) { + return false; + } + } } - } - return true; + return true; } bool RocmBandwidthTest::ValidateReadOrWriteReq(vector& in_list) { + // Determine read / write request is even + // Request is specified as a list of memory + // pool, agent tuples - first element identifies + // memory pool while the second element denotes + // an agent + uint32_t list_size = in_list.size(); + if ((list_size % 2) != 0) { + return false; + } - // Determine read / write request is even - // Request is specified as a list of memory - // pool, agent tuples - first element identifies - // memory pool while the second element denotes - // an agent - uint32_t list_size = in_list.size(); - if ((list_size % 2) != 0) { - return false; - } - - // Validate the list of pool-agent tuples - for (uint32_t idx = 0; idx < list_size; idx+=2) { - uint32_t pool_idx = in_list[idx]; - uint32_t exec_idx = in_list[idx + 1]; - // Determine the pool and agent exist in system - if ((pool_idx >= pool_index_) || - (exec_idx >= agent_index_)) { - return false; + // Validate the list of pool-agent tuples + for (uint32_t idx = 0; idx < list_size; idx += 2) { + uint32_t pool_idx = in_list[idx]; + uint32_t exec_idx = in_list[idx + 1]; + // Determine the pool and agent exist in system + if ((pool_idx >= pool_index_) || (exec_idx >= agent_index_)) { + return false; + } } - } - return true; + return true; } -bool RocmBandwidthTest::ValidateReadReq() { - return ValidateReadOrWriteReq(read_list_); -} +bool RocmBandwidthTest::ValidateReadReq() { return ValidateReadOrWriteReq(read_list_); } -bool RocmBandwidthTest::ValidateWriteReq() { - return ValidateReadOrWriteReq(write_list_); -} +bool RocmBandwidthTest::ValidateWriteReq() { return ValidateReadOrWriteReq(write_list_); } bool RocmBandwidthTest::ValidateCopyReq(vector& in_list) { - - // Determine pool list length is valid - uint32_t count = in_list.size(); - uint32_t pool_count = pool_list_.size(); - if (count > pool_count) { - return false; - } - - // Determine no pool is duplicated - bool status = PoolIsDuplicated(in_list); - if (status == false) { - return false; - } - - // Determine every pool is present in system - return PoolIsPresent(in_list); -} + // Determine pool list length is valid + uint32_t count = in_list.size(); + uint32_t pool_count = pool_list_.size(); + if (count > pool_count) { + return false; + } -bool RocmBandwidthTest::ValidateBidirCopyReq() { - return ValidateCopyReq(bidir_list_); + // Determine no pool is duplicated + bool status = PoolIsDuplicated(in_list); + if (status == false) { + return false; + } + + // Determine every pool is present in system + return PoolIsPresent(in_list); } +bool RocmBandwidthTest::ValidateBidirCopyReq() { return ValidateCopyReq(bidir_list_); } + bool RocmBandwidthTest::ValidateUnidirCopyReq() { - return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_))); + return ((ValidateCopyReq(src_list_)) && (ValidateCopyReq(dst_list_))); } bool RocmBandwidthTest::ValidateConcurrentCopyReq() { - - // Determine every pool is present in system - return PoolIsPresent(bidir_list_); + // Determine every pool is present in system + return PoolIsPresent(bidir_list_); } bool RocmBandwidthTest::ValidateArguments() { - - // Determine if user has requested a READ - // operation and gave valid inputs - if (req_read_ == REQ_READ) { - return ValidateReadReq(); - } - - // Determine if user has requested a WRITE - // operation and gave valid inputs - if (req_write_ == REQ_WRITE) { - return ValidateWriteReq(); - } - - // Determine if user has requested a Copy - // operation that is bidirectional and gave - // valid inputs. Same validation is applied - // for all-to-all unidirectional copy operation - if ((req_copy_bidir_ == REQ_COPY_BIDIR) || - (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { - return ValidateBidirCopyReq(); - } - - // Determine if user has requested a Copy - // operation that is unidirectional and gave - // valid inputs. Same validation is applied - // for all-to-all bidirectional copy operation - if ((req_copy_unidir_ == REQ_COPY_UNIDIR) || - (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { - return ValidateUnidirCopyReq(); - } - - // Determine if user has requested a Concurrent - // Copy operation that is unidirectional or bidirectional - // and gave valid inputs. - if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || - (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { - return ValidateConcurrentCopyReq(); - } - - // All of the request are well formed - return true; + // Determine if user has requested a READ + // operation and gave valid inputs + if (req_read_ == REQ_READ) { + return ValidateReadReq(); + } + + // Determine if user has requested a WRITE + // operation and gave valid inputs + if (req_write_ == REQ_WRITE) { + return ValidateWriteReq(); + } + + // Determine if user has requested a Copy + // operation that is bidirectional and gave + // valid inputs. Same validation is applied + // for all-to-all unidirectional copy operation + if ((req_copy_bidir_ == REQ_COPY_BIDIR) || (req_copy_all_bidir_ == REQ_COPY_ALL_BIDIR)) { + return ValidateBidirCopyReq(); + } + + // Determine if user has requested a Copy + // operation that is unidirectional and gave + // valid inputs. Same validation is applied + // for all-to-all bidirectional copy operation + if ((req_copy_unidir_ == REQ_COPY_UNIDIR) || (req_copy_all_unidir_ == REQ_COPY_ALL_UNIDIR)) { + return ValidateUnidirCopyReq(); + } + + // Determine if user has requested a Concurrent + // Copy operation that is unidirectional or bidirectional + // and gave valid inputs. + if ((req_concurrent_copy_bidir_ == REQ_CONCURRENT_COPY_BIDIR) || + (req_concurrent_copy_unidir_ == REQ_CONCURRENT_COPY_UNIDIR)) { + return ValidateConcurrentCopyReq(); + } + + // All of the request are well formed + return true; }