Common SQLite examples¶
This section lists recepies to frequently asked questions on “how-to’s” with NVIDIA Nsight Systems data exported in SQLite format.
General notes¶
The data exported into SQLite database don’t contain any indexes by default and are as close to the contents of the report as possible. Therefore, for better readability and speed of execution, queries below will alter tables.
Common helper commands¶
When utilizing sqlite3 commandline tool, it’s helpful to have data printed as named columns, this can be done with:
.mode column
.headers on
Default column width is determined by the data in the first row of results. If this doesn’t work out well, you can specify widths manually:
.width 10 20 50
Obtaining sample report¶
CLI interface of Nsight Systems was used to profile radixSortThrust
CUDA sample, then the resulting .qdrep
file was exported using the
nsys-exporter
tool.
nsys profile --trace=cuda,osrt radixSortThrust
nsys-exporter --export-sqlite report1.qdrep
Transfer CUDA kernel names onto API ranges¶
ALTER TABLE CUPTI_ACTIVITY_KIND_RUNTIME ADD COLUMN name TEXT;
ALTER TABLE CUPTI_ACTIVITY_KIND_RUNTIME ADD COLUMN kernelName TEXT;
UPDATE CUPTI_ACTIVITY_KIND_RUNTIME SET kernelName =
(SELECT value FROM StringIds
JOIN CUPTI_ACTIVITY_KIND_KERNEL AS cuda_gpu
ON cuda_gpu.demangledName = StringIds.id
AND CUPTI_ACTIVITY_KIND_RUNTIME.correlationId = cuda_gpu.correlationId);
UPDATE CUPTI_ACTIVITY_KIND_RUNTIME SET name =
(SELECT value FROM StringIds WHERE nameId = StringIds.id);
-- Select 10 longest CUDA API ranges that resulted in kernel execution.
SELECT name, kernelName, start, end FROM CUPTI_ACTIVITY_KIND_RUNTIME
WHERE kernelName IS NOT NULL ORDER BY end - start LIMIT 10;
Results:
name kernelName start end
---------------------- -------------------------------------------------------------------------------------------------------------------- ---------- ----------
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 658863435 658868490
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 609755015 609760075
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 632683286 632688349
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 606495356 606500439
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 603114486 603119586
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 802729785 802734906
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 593381170 593386294
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 658759955 658765090
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 681549917 681555059
cudaLaunchKernel_v7000 void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<unsigned int, unsigned int, int>::Policy700, int>(int*, int) 717812527 717817671
Remove ranges overlapping with overhead¶
-- Use the this query to count CUDA API ranges overlapping with the overhead ones.
-- Replace "SELECT COUNT(*)" with "DELETE" to remove such ranges.
SELECT COUNT(*) FROM CUPTI_ACTIVITY_KIND_RUNTIME WHERE id IN
(
SELECT cuda.id
FROM PROFILER_OVERHEAD as overhead
INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as cuda ON
(cuda.start BETWEEN overhead.start and overhead.end)
OR (cuda.end BETWEEN overhead.start and overhead.end)
OR (cuda.start < overhead.start AND cuda.end > overhead.end)
);
Results:
COUNT(*)
----------
1095
Backtraces for OSRT ranges¶
-- Adding text columns makes results of the query below more human-readable.
-- These steps are completely optional.
ALTER TABLE OSRT_API ADD COLUMN name TEXT;
UPDATE OSRT_API SET name = (SELECT value FROM StringIds WHERE OSRT_API.nameId = StringIds.id);
ALTER TABLE OSRT_CALLCHAINS ADD COLUMN symbolName TEXT;
UPDATE OSRT_CALLCHAINS SET symbolName = (SELECT value FROM StringIds WHERE symbol = StringIds.id);
ALTER TABLE OSRT_CALLCHAINS ADD COLUMN moduleName TEXT;
UPDATE OSRT_CALLCHAINS SET moduleName = (SELECT value FROM StringIds WHERE module = StringIds.id);
-- Print backtrace of the longest OSRT range
SELECT globalTid / 0x1000000 % 0x1000000 AS PID, globalTid % 0x1000000 AS TID,
OSRT_API.id, start, end, name, callchainId, stackDepth, symbolName, moduleName
FROM OSRT_API LEFT JOIN OSRT_CALLCHAINS ON callchainId == OSRT_CALLCHAINS.id
WHERE OSRT_API.id IN (SELECT id FROM OSRT_API ORDER BY end - start DESC LIMIT 1)
ORDER BY stackDepth LIMIT 10;
Results:
PID TID id start end name callchainId stackDepth symbolName moduleName
---------- ---------- ---------- ---------- ---------- ---------------------- ----------- ---------- ------------------------------ ----------------------------------------
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 0 pthread_cond_timedwait@GLIBC_2 /lib/x86_64-linux-gnu/libpthread-2.27.so
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 1 0x7fbc983b7227 /usr/lib/x86_64-linux-gnu/libcuda.so.418
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 2 0x7fbc9835d5c7 /usr/lib/x86_64-linux-gnu/libcuda.so.418
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 3 0x7fbc983b64a8 /usr/lib/x86_64-linux-gnu/libcuda.so.418
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 4 start_thread /lib/x86_64-linux-gnu/libpthread-2.27.so
19163 19176 676 360897690 860966851 pthread_cond_timedwait 88 5 __clone /lib/x86_64-linux-gnu/libc-2.27.so