create nsys-rep using nvtxw

Add the nvtxw API headers and implementation code needed to export CUPTI data to an nsys-rep file using nvtxw.
NVIDIA · Sep 27, 2024 · ea40f9c · ea40f9c
1 parent a8f1c7f
commit ea40f9c
Show file tree

Hide file tree

Showing 14 changed files with 4,697 additions and 7 deletions.
diff --git a/src/main/cpp/profiler/CMakeLists.txt b/src/main/cpp/profiler/CMakeLists.txt
@@ -77,6 +77,11 @@ configure_file(
 
 add_executable(spark_rapids_profile_converter
   spark_rapids_profile_converter.cpp
+  initialize_nvtxw.cpp
+  nvtxw3.cpp
+  nvtxw3.h
+  NvtxwEvents.cpp
+  NvtxwEvents.h
   "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp"
   "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp"
   "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h"
@@ -86,6 +91,8 @@ target_include_directories(
   spark_rapids_profile_converter
   PRIVATE
   "${CUDAToolkit_INCLUDE_DIRS}"
+  "${SPARK_RAPIDS_JNI_SOURCE_DIR}"
+  "${SPARK_RAPIDS_JNI_SOURCE_DIR}/profiler"
   "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src"
   "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}"
 )

diff --git a/src/main/cpp/profiler/NvtxwEvents.cpp b/src/main/cpp/profiler/NvtxwEvents.cpp
diff --git a/src/main/cpp/profiler/NvtxwEvents.h b/src/main/cpp/profiler/NvtxwEvents.h
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * 
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.txt for license information.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <nvtx3/nvToolsExtPayload.h>
+
+namespace NvidiaNvtxw 
+{
+
+namespace PayloadSchemaId // Numeric schema IDs; presumably one per Get*SchemaAttr() getter declared in this header — confirm in NvtxwEvents.cpp.
+{
+    static constexpr uint64_t nameId              = 0xffffff00; // The nine IDs are consecutive: 0xffffff00 .. 0xffffff08.
+    static constexpr uint64_t nvtxRangePushPopId  = 0xffffff01;
+    static constexpr uint64_t cuptiApiId          = 0xffffff02;
+    static constexpr uint64_t cuptiMemcpyId       = 0xffffff03;
+    static constexpr uint64_t cuptiMemsetId       = 0xffffff04;
+    static constexpr uint64_t cuptiDeviceId       = 0xffffff05;
+    static constexpr uint64_t cuptiKernelId       = 0xffffff06;
+    static constexpr uint64_t cuptiOverheadId     = 0xffffff07;
+    static constexpr uint64_t nvtxRangeStartEndId = 0xffffff08; // Numerically last; not adjacent to nvtxRangePushPopId.
+}; // namespace PayloadSchemaId — NOTE(review): the ';' after the brace is an empty declaration and can be dropped.
+
+const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably the schema tagged PayloadSchemaId::nameId — defined outside this header, confirm there.
+
+struct nvtxRangeEvent { // Plain record with a start/stop time pair, a name and process/thread IDs; presumably the layout both range schemas below describe — confirm in NvtxwEvents.cpp.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    const char* name; // Raw pointer to read-only chars; ownership and lifetime are not expressed here — confirm with the producer.
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint32_t color; // NOTE(review): colour encoding is not visible here — confirm. Three uint32_t after a pointer leave tail padding on typical 64-bit ABIs.
+};
+const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably for push/pop ranges — defined outside this header.
+const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably for start/end ranges — defined outside this header.
+struct cuptiApiEvent { // Plain record with a start/stop time pair plus six 32-bit fields; presumably one CUPTI API call activity — confirm against the converter.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    uint32_t kind; // presumably the CUPTI activity kind — TODO confirm
+    uint32_t cbid; // presumably the CUPTI callback ID of the API function — TODO confirm
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint32_t correlation_id; // The kernel, memcpy and memset records in this header carry a field of the same name.
+    uint32_t return_value;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiApiEvent — defined outside this header.
+struct cuptiDevice { // Plain record of device properties: two 64-bit fields, 24 32-bit fields and a name pointer. Presumably filled from CUPTI device activity — confirm.
+    uint64_t global_memory_bandwidth; // NOTE(review): units for the bandwidth/size/clock fields are not stated in this header — confirm.
+    uint64_t global_memory_size;
+    uint32_t constant_memory_size;
+    uint32_t l2_cache_size;
+    uint32_t num_threads_per_warp;
+    uint32_t core_clock_rate;
+    uint32_t num_memcpy_engines;
+    uint32_t num_multiprocessors;
+    uint32_t max_ipc;
+    uint32_t max_warps_per_multiprocessor;
+    uint32_t max_blocks_per_multiprocessor;
+    uint32_t max_shared_memory_per_multiprocessor;
+    uint32_t max_registers_per_multiprocessor;
+    uint32_t max_registers_per_block;
+    uint32_t max_shared_memory_per_block;
+    uint32_t max_threads_per_block;
+    uint32_t max_block_dim_x;
+    uint32_t max_block_dim_y;
+    uint32_t max_block_dim_z;
+    uint32_t max_grid_dim_x;
+    uint32_t max_grid_dim_y;
+    uint32_t max_grid_dim_z;
+    uint32_t compute_capability_major;
+    uint32_t compute_capability_minor;
+    uint32_t id; // presumably the value the device_id fields of the event records refer to — TODO confirm
+    uint32_t ecc_enabled; // Stored as a 32-bit integer rather than bool; presumably 0/non-zero — TODO confirm.
+    const char* name; // Raw pointer to read-only chars; ownership and lifetime are not expressed here — confirm with the producer.
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiDevice — defined outside this header.
+struct cuptiKernelEvent { // Plain record for a kernel event; members are ordered 64-bit integers, name pointer, 32-bit, 16-bit, then 8-bit. Presumably mirrors a CUPTI kernel activity record — confirm.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    uint64_t completed;
+    uint64_t grid_id;
+    uint64_t queued;
+    uint64_t submitted;
+    uint64_t graph_node_id;
+    uint64_t local_memory_total_v2; // 64-bit field; a separate 32-bit local_memory_total also exists further down.
+    const char * name; // Raw pointer to read-only chars; ownership and lifetime are not expressed here — confirm with the producer.
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t grid_x;
+    uint32_t grid_y;
+    uint32_t grid_z;
+    uint32_t block_x;
+    uint32_t block_y;
+    uint32_t block_z;
+    uint32_t static_shared_memory;
+    uint32_t dynamic_shared_memory;
+    uint32_t local_memory_per_thread;
+    uint32_t local_memory_total; // 32-bit field; see also the 64-bit local_memory_total_v2 above.
+    uint32_t correlation_id; // cuptiApiEvent carries a field of the same name; presumably used to link the two — TODO confirm.
+    uint32_t shared_memory_executed;
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint32_t cluster_x;
+    uint32_t cluster_y;
+    uint32_t cluster_z;
+    uint32_t cluster_scheduling_policy;
+    uint16_t registers_per_thread; // The only 16-bit member.
+    uint8_t requested; // NOTE(review): what is "requested"/"executed" is not named here (presumably the cache configuration) — confirm.
+    uint8_t executed;
+    uint8_t shared_memory_config;
+    uint8_t partitioned_global_cache_requested;
+    uint8_t partitioned_global_cache_executed;
+    uint8_t launch_type;
+    uint8_t is_shared_memory_carveout_requested;
+    uint8_t shared_memory_carveout_requested;
+    uint8_t shmem_limit_config;
+    uint8_t channel_type; // snake_case here; the memcpy and memset records spell the matching field channelType.
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiKernelEvent — defined outside this header.
+
+struct cuptiMemcpyEvent { // Plain record for a memory-copy event: four 64-bit, eight 32-bit and four 8-bit fields. Presumably mirrors a CUPTI memcpy activity record — confirm.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    uint64_t bytes;
+    uint64_t graph_node_id;
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t correlation_id;
+    uint32_t runtime_correlation_id; // Second correlation field; only this record has it. Its relation to correlation_id is not visible here — confirm.
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint8_t channelType; // NOTE(review): camelCase, unlike channel_type in cuptiKernelEvent; renaming would touch every user of this field.
+    uint8_t copy_kind; // presumably CUPTI memcpy-kind / memory-kind enum values narrowed to 8 bits — TODO confirm
+    uint8_t src_kind;
+    uint8_t dst_kind;
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiMemcpyEvent — defined outside this header.
+
+struct cuptiMemsetEvent { // Plain record for a memory-set event: four 64-bit, eight 32-bit and three 8-bit fields. Presumably mirrors a CUPTI memset activity record — confirm.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    uint64_t bytes;
+    uint64_t graph_node_id;    // Same name and width as graph_node_id in the kernel and memcpy records.
+    uint32_t device_id;
+    uint32_t context_id;
+    uint32_t stream_id;
+    uint32_t process_id;
+    uint32_t correlation_id;
+    uint32_t graph_id;
+    uint32_t channel_id;
+    uint32_t value; // presumably the value written by the memset — TODO confirm
+    uint8_t channelType; // NOTE(review): camelCase, unlike channel_type in cuptiKernelEvent; renaming would touch every user of this field.
+    uint8_t mem_kind;
+    uint8_t flags; // Bit meanings are not defined in this header — confirm against the producer.
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiMemsetEvent — defined outside this header.
+struct cuptiOverheadEvent { // Plain record with a start/stop time pair, process/thread IDs and a one-byte kind. Presumably mirrors a CUPTI overhead activity record — confirm.
+    uint64_t time_start; // NOTE(review): timestamp unit and clock source are not stated in this header — confirm.
+    uint64_t time_stop;
+    uint32_t process_id;
+    uint32_t thread_id;
+    uint8_t overhead_kind; // Single trailing byte, so the struct ends with padding on typical ABIs; enum values are not defined here — confirm.
+};
+const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr(); // Returns a pointer to a read-only schema attribute; presumably describing cuptiOverheadEvent — defined outside this header.
+
+}