# Source code for habitat.analysis.wave_scaling.roofline
import math
from habitat.analysis.metrics import Metric
from habitat.analysis.kernels import PredictedKernel
from habitat.analysis.wave_scaling.common import calculate_wave_info
def roofline_wave_scaling(
    kernel,
    origin_device,
    dest_device,
    metadata_manager,
):
    """Predict a kernel's run time on ``dest_device`` using roofline weights.

    The run time measured on ``origin_device`` (``kernel.run_time_ns``) is
    multiplied by ratios of device properties. The weight ``gamma`` returned
    by ``_roofline_gamma`` is the exponent applied to the memory bandwidth
    ratio; ``1 - gamma`` is the exponent applied to the compute-related
    ratios (clock and SM count, or clock and occupancy for small kernels).

    Args:
        kernel: The measured kernel. ``num_blocks`` and ``run_time_ns`` are
            read here; it is also passed on to the helpers.
        origin_device: The device the kernel was measured on.
        dest_device: The device to predict the run time for.
        metadata_manager: Forwarded unchanged to ``calculate_wave_info``.

    Returns:
        A ``PredictedKernel`` built from ``kernel`` and the scaled run time
        in nanoseconds.
    """
    memory_weight = _roofline_gamma(kernel, origin_device, dest_device)
    compute_weight = 1.0 - memory_weight

    # Only the two wave sizes are used below; the occupancies are ignored.
    wave_info = calculate_wave_info(
        kernel,
        origin_device,
        dest_device,
        metadata_manager,
    )
    origin_wave_size, dest_wave_size, _origin_occupancy, _dest_occupancy = (
        wave_info
    )

    # 1. A kernel is "small" when it does not fill a single wave on the
    #    origin device AND does not fill a single wave on the destination
    #    device. The second quotient is only evaluated when the first is 0.
    fills_some_wave = (
        kernel.num_blocks // origin_wave_size != 0
        or kernel.num_blocks // dest_wave_size != 0
    )
    if not fills_some_wave:
        # Small kernels are scaled by a compute factor only: the clock ratio
        # times the ratio of ceil(num_blocks / num_sms) on each device.
        origin_max_occupancy = math.ceil(
            kernel.num_blocks / origin_device.num_sms
        )
        dest_max_occupancy = math.ceil(
            kernel.num_blocks / dest_device.num_sms
        )
        clock_ratio = (
            origin_device.base_clock_mhz / dest_device.base_clock_mhz
        )
        occupancy_ratio = dest_max_occupancy / origin_max_occupancy
        small_kernel_factor = clock_ratio * occupancy_ratio
        predicted_ns = kernel.run_time_ns * math.pow(
            small_kernel_factor, compute_weight
        )
        return PredictedKernel(kernel, predicted_ns)

    # 2. Origin-to-destination ratios for the three device properties.
    bandwidth_ratio = (
        origin_device.mem_bandwidth_gb / dest_device.mem_bandwidth_gb
    )
    clock_ratio = origin_device.base_clock_mhz / dest_device.base_clock_mhz
    sm_ratio = origin_device.num_sms / dest_device.num_sms

    # 3. Weight each ratio by its exponent and scale the measured run time.
    bandwidth_term = math.pow(bandwidth_ratio, memory_weight)
    clock_term = math.pow(clock_ratio, compute_weight)
    sm_term = math.pow(sm_ratio, compute_weight)
    predicted_ns = kernel.run_time_ns * bandwidth_term * clock_term * sm_term
    return PredictedKernel(kernel, predicted_ns)
def _roofline_gamma(kernel, origin_device, dest_device):
    """Return the roofline weight ``gamma`` in ``[0, 1]`` for ``kernel``.

    A value of 1 treats the kernel as fully memory bandwidth bound and a
    value of 0 as fully compute bound. In between, ``gamma`` is derived from
    the kernel's arithmetic intensity (GFLOPs per GB of DRAM traffic) and
    the ridge point ``R`` of the destination device only:

        gamma = -0.5/R * intensity + 1    if intensity <= R
                0.5R / intensity          otherwise

    Args:
        kernel: The measured kernel; its FLOP efficiency and DRAM read/write
            byte metrics and its ``run_time_ns`` are read.
        origin_device: The device the kernel was measured on; its
            ``peak_gflops_per_second`` converts the efficiency percentage
            into an achieved rate.
        dest_device: The device whose ridge point is used.

    Returns:
        The weight ``gamma`` as a float.
    """
    flop_efficiency_pct = kernel.get_metric(
        Metric.SinglePrecisionFLOPEfficiency
    )
    bytes_read = kernel.get_metric(Metric.DRAMReadBytes)
    bytes_written = kernel.get_metric(Metric.DRAMWriteBytes)

    dram_traffic_gb = (bytes_read + bytes_written) / 1024 / 1024 / 1024
    achieved_gflops_per_second = (
        flop_efficiency_pct / 100 * origin_device.peak_gflops_per_second
    )
    gflops_executed = achieved_gflops_per_second * kernel.run_time_ns / 1e9

    if gflops_executed < 1e-9:
        # Effectively no floating point work: treated as fully memory
        # bandwidth bound, even though the DRAM traffic could also be 0.
        return 1.

    if dram_traffic_gb == 0:
        # There is floating point work but no DRAM traffic, so the kernel
        # is fully compute bound.
        return 0.

    intensity = gflops_executed / dram_traffic_gb
    ridge = _ridge_point(dest_device)
    if intensity > ridge:
        # Past the ridge point: decay as 1/x, starting from 0.5 at R.
        gamma = 0.5 * ridge / intensity
    else:
        # Up to the ridge point: linear from 1 at intensity 0 to 0.5 at R.
        gamma = -0.5 / ridge * intensity + 1.

    assert 0 <= gamma <= 1
    return gamma
def _ridge_point(device):
    """Return the roofline ridge point of ``device``.

    This is ``device.peak_gflops_per_second`` divided by
    ``device.mem_bandwidth_gb``.
    """
    peak_compute = device.peak_gflops_per_second
    peak_bandwidth = device.mem_bandwidth_gb
    return peak_compute / peak_bandwidth