Skip to content

Instantly share code, notes, and snippets.

@smpanaro
Last active August 11, 2024 23:34
Show Gist options
  • Save smpanaro/5bb1370157bbaa0dd31cdd0007bb3f75 to your computer and use it in GitHub Desktop.
Save smpanaro/5bb1370157bbaa0dd31cdd0007bb3f75 to your computer and use it in GitHub Desktop.
Apple Neural Engine Throughput Bench

Toy model to exercise the ANE.

$ pip install coremltools torch numpy calflops
$ python throughput.py

Converting PyTorch Frontend ==> MIL Ops:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 92/93 [00:00<00:00, 7276.15 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1071.89 passes/s]
Running MIL default pipeline: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 79/79 [00:15<00:00,  5.03 passes/s]
Running MIL backend_mlprogram pipeline: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 1207.37 passes/s]
<class 'coremltools.optimize.coreml._quantization_passes.palettize_weights'>
Running compression pass palettize_weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:24<00:00,  1.53s/ ops]
Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]
Running MIL default pipeline: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 430.91 passes/s]
Running MIL backend_mlprogram pipeline: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 803.33 passes/s]

Total GFLOPs: 369.38973184
Mean Prediction: 33.389056 ms
TFLOP/sec: 11.063197828653799

Try increasing the batch size (the first dimension of `input_shape`, line 74 of throughput.py) if TFLOP/sec is lower than expected.

import torch
from torch import nn
import numpy as np
import coremltools as ct
import coremltools.optimize as cto
import time
class MLP(nn.Module):
    """Feed-forward block: 1x1 conv up-projection, SiLU, 1x1 conv down-projection."""

    def __init__(self, outer_dim, inner_dim):
        super().__init__()
        # Both projections are bias-free convolutions with a kernel size of 1.
        self.up = nn.Conv2d(
            in_channels=outer_dim, out_channels=inner_dim, kernel_size=1, bias=False
        )
        self.proj = nn.Conv2d(
            in_channels=inner_dim, out_channels=outer_dim, kernel_size=1, bias=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``proj(silu(up(x)))``."""
        hidden = torch.nn.functional.silu(self.up(x))
        return self.proj(hidden)
class Net(nn.Module):
    """Eight identically-sized MLP blocks chained one after another."""

    def __init__(self, outer_dim, inner_dim):
        super().__init__()
        layers = [MLP(outer_dim, inner_dim) for _ in range(8)]
        self.mlps = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through every MLP block in order."""
        return self.mlps(x)
def make_model(input_shape):
    """Build the benchmark network and convert it to a palettized Core ML model.

    Args:
        input_shape: Shape of the random example input used for FLOP
            counting, tracing, and the converted model's "input" tensor.

    Returns:
        A ``(model, flops)`` tuple: the weight-palettized Core ML model and
        the FLOP count reported by calflops, or ``None`` for ``flops`` when
        an ImportError is raised while importing or running calflops.
    """
    network = Net(4096, 11008).eval()
    example = torch.randn(input_shape)

    # FLOP counting is optional; it depends on the calflops package.
    flops = None
    try:
        from calflops import calculate_flops

        flops, _macs, _params = calculate_flops(
            model=network,
            input_shape=tuple(example.shape),
            print_results=False,
            output_as_string=False,
        )
    except ImportError:
        print("To calculate FLOPs automatically run:")
        print("pip install --upgrade calflops")

    # Trace the network, then convert it with float16 input, output and compute.
    traced = torch.jit.trace(network, example)
    fp16_input = ct.TensorType(name="input", shape=example.shape, dtype=np.float16)
    fp16_output = ct.TensorType(name="output", dtype=np.float16)
    mlmodel = ct.convert(
        traced,
        inputs=[fp16_input],
        outputs=[fp16_output],
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=ct.precision.FLOAT16,
        convert_to="mlprogram",
    )

    # Compress to 1 bit to minimize impact of memory bandwidth.
    # (4 bits also saturates the M1 ANE at higher batch sizes, but it's
    # slower to convert the model.)
    palettizer = cto.coreml.OpPalettizerConfig(mode="uniform", nbits=1)
    compression = cto.coreml.OptimizationConfig(global_config=palettizer)
    compressed = cto.coreml.palettize_weights(mlmodel, compression)
    return compressed, flops
def bench(model, input_shape, warmup=100, loops=1000):
    """Measure the mean latency of ``model.predict`` on a random input.

    Args:
        model: Object exposing ``predict(dict)``; it is called with a
            single-entry dict keyed by ``"input"``.
        input_shape: Shape of the random tensor fed to the model.
        warmup: Number of untimed predictions made before measuring.
        loops: Number of timed predictions; must be at least 1.

    Returns:
        Mean time per prediction in milliseconds, as a float.

    Raises:
        ValueError: If ``loops`` is less than 1.
    """
    if loops < 1:
        raise ValueError("loops must be at least 1")

    # The input never changes, so convert it to NumPy once instead of on
    # every iteration; this keeps the conversion out of the timed region.
    array = torch.randn(input_shape).numpy()

    # Warm up.
    for _ in range(warmup):
        model.predict({"input": array})

    # Benchmark. perf_counter_ns is monotonic, so system clock adjustments
    # during the run cannot skew (or make negative) the measured interval.
    start = time.perf_counter_ns()
    for _ in range(loops):
        model.predict({"input": array})
    elapsed_ns = time.perf_counter_ns() - start

    mean_ns = elapsed_ns / loops
    return mean_ns / 1_000_000
# Benchmark entry point: build the model, time it, and report the results.
input_shape = (4, 4096, 8, 8)
model, flops = make_model(input_shape)
duration_ms = bench(model, input_shape)

if flops is not None:
    # Throughput = FLOPs per prediction / seconds per prediction.
    flops_sec = flops / (duration_ms / 1000.0)
    print(f"\nTotal GFLOPs: {flops / 1e9}\nMean Prediction: {duration_ms} ms\nTFLOP/sec: {flops_sec / 1e12}")
else:
    # No FLOP count available: report latency only, plus the install hint.
    print(f"\nMean Prediction: {duration_ms} ms")
    print("To calculate FLOPs automatically run:")
    print("pip install --upgrade calflops")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment