@Smthri
Last active August 16, 2021 08:07
Our result is approximately equal to the float output of the non-quantized convolution. However, PyTorch sometimes yields a result with a large error.
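For reference, the script below relies on the standard affine quantization scheme: a float value v maps to an integer q = round(v / scale) + zero_point and is recovered as v ≈ (q - zero_point) * scale. A minimal sketch of that roundtrip, using a made-up scale and zero point rather than values calibrated by the script:

import numpy as np

# Illustration only: the scale and zero point here are arbitrary, not calibrated.
scale, zp = 0.01, 128
v = 0.37
q = np.clip(np.round(v / scale) + zp, 0, 255)  # quantize to the uint8 range
v_hat = (q - zp) * scale                       # dequantize
print(v, v_hat)  # v_hat matches v to within half a quantization step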
import numpy as np
import torch
import torch.quantization as tq
from torch import nn
from copy import deepcopy
## Manual quantization
# weight, bias, input
kernel_size = 3
w = np.random.randn(kernel_size, kernel_size)
# In this case the bias is set to 0. With a non-zero bias, torch also yields incorrect results.
b = np.random.randn(1) * 0
x = np.random.randn(kernel_size, kernel_size)
print(w)
print(b)
print(x)
# float output
out_f = (np.sum(w*x) + b)[0]
print(out_f)
# first, calibrate wx + b, wx, w, x and b
def get_scale_zp(max_, min_, Qmin=0, Qmax=255):
    scale = (max_ - min_) / (Qmax - Qmin)
    if scale == 0.:
        return (1., 0.)
    zp = Qmin - np.round(min_ / scale)
    return (scale, zp)
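# Sanity check added for illustration (not part of the original gist): the affine
# parameters returned for a made-up range should map min_ to Qmin and max_ to Qmax.
_s, _z = get_scale_zp(2.0, -1.0)
assert np.round(-1.0 / _s) + _z == 0 and np.round(2.0 / _s) + _z == 255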
# MinMaxObservers, when processing a single-value tensor, update only the min or the max value; the other stays at 0, so we include 0 in the ranges below to match.
wxb_max, wxb_min = (max(out_f, 0), min(out_f, 0))
wx_max, wx_min = (max(np.sum(w*x), 0), min(np.sum(w*x), 0))
w_max, w_min = (np.max(w), np.min(w))
x_max, x_min = (np.max(x), np.min(x))
b_max, b_min = (np.max(b), np.min(b))
wxb_scale, wxb_zp = get_scale_zp(wxb_max, wxb_min)
print(f'wx + b: {wxb_scale}, {wxb_zp}')
# Looks like this is never actually used
wx_scale, wx_zp = get_scale_zp(wx_max, wx_min)
print(f'wx: {wx_scale}, {wx_zp}')
# We're quantizing to qint8 now
w_scale, w_zp = get_scale_zp(w_max, w_min, -128, 127)
print(f'w: {w_scale}, {w_zp}')
x_scale, x_zp = get_scale_zp(x_max, x_min)
print(f'x: {x_scale}, {x_zp}')
b_scale, b_zp = get_scale_zp(max(b_max, 0), min(b_min, 0))
print(f'b: {b_scale}, {b_zp}')
# quantize input, weights and bias
x_q = np.round(x / x_scale + x_zp)
w_q = np.round(w / w_scale + w_zp)
b_q = np.round(b / b_scale + b_zp)
print(w_q, '\r\n\r\n', b_q, '\r\n\r\n', x_q)
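# Roundtrip check added for illustration (not part of the original gist):
# dequantizing x_q should recover x to within about half a quantization step.
assert np.max(np.abs((x_q - x_zp) * x_scale - x)) <= x_scale / 2 + 1e-9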
# transform float multiplier and perform quantized operation
Q = (w_scale * x_scale / wxb_scale)
n = int(np.floor(np.log2((2 ** 16 - 1) / Q)))
A = int(np.floor(2 ** n * Q))
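# Added check (illustration only): A / 2**n is a fixed-point approximation of the
# float requantization multiplier Q, accurate here to roughly one part in 2**15.
assert abs(A / 2 ** n - Q) / Q < 1e-4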
wx_q = np.round(np.clip(np.sum((w_q - w_zp) * (x_q - x_zp)), -32768, 32767) * A / (2 ** n)) + \
       np.round((b_q - b_zp) * (b_scale / wxb_scale)) + \
       wxb_zp
wx_q = np.clip(wx_q, 0, 255)
print(wx_q)
# dequantize
wxb_q = wx_q
wxb = (wxb_q - wxb_zp) * wxb_scale
print(f'dequant output:\r\n{wxb}')
print(f'diff:\r\n{wxb - out_f}')
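# For reference (added comment, not in the original gist): one output quantization
# step is wxb_scale, so a diff of this order is expected from rounding alone.
print(f'one output step (wxb_scale): {wxb_scale}')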
## Comparing with torch
xt = torch.Tensor([[x]])
wt = nn.Parameter(torch.Tensor([[w]]))
bt = nn.Parameter(torch.Tensor(b))
def quantize(model, input_shape):
    with torch.no_grad():
        # model = tq.QuantWrapper(model)
        observer = tq.PerChannelMinMaxObserver()
        model.qconfig = torch.quantization.QConfig(activation=tq.MinMaxObserver,
                                                   weight=observer.with_args(dtype=torch.qint8,
                                                                             qscheme=torch.per_channel_affine))
        # model.qconfig = torch.quantization.get_default_qconfig('qnnpack')
        model = tq.QuantWrapper(model)
        tq.prepare(model, inplace=True)
        # calibrate with a single forward pass, then convert to the quantized model
        tmp = model(torch.Tensor([[x]]))
        tq.convert(model, inplace=True)
        return model
model = nn.Conv2d(1, 1, kernel_size, bias=True)
model.weight = wt
model.bias = bt
# generate another model with zero weights and a bias, to see what value the bias is quantized to
modelb = deepcopy(model)
for p in modelb.parameters():
    torch.nn.init.zeros_(p)
modelb.bias = bt
print(model.weight.data, '\r\n\r\n',
      model.bias.data,
      '\r\n\r\ntorch output:\r\n',
      model(xt),
      '\r\n\r\nwhereas our output is:\r\n',
      out_f)
model = quantize(model, (1, kernel_size, kernel_size))
modelb = quantize(modelb, (1, kernel_size, kernel_size))
# step-by-step forward:
q_inp = model.quant(xt)
q_outp = model.module(q_inp)
f_outp = model.dequant(q_outp)
inp_scale = q_inp.q_scale()
inp_zero_point = q_inp.q_zero_point()
outp_scale = q_outp.q_scale()
outp_zero_point = q_outp.q_zero_point()
print('torch input scale, zp:\r\n',
      inp_scale, inp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      x_scale, x_zp, '\r\n')
print('torch output scale, zp:\r\n',
      outp_scale, outp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      wxb_scale, wxb_zp, '\r\n')
per_ch_zp = model.module.weight().q_per_channel_zero_points()
per_ch_scales = model.module.weight().q_per_channel_scales()
print('torch channels scale, zp:\r\n',
      per_ch_scales, per_ch_zp,
      '\r\nwhereas our scale, zp are:\r\n',
      w_scale, w_zp, '\r\n')
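# Added comparison (illustration only): for a quantized conv the effective
# requantization multiplier is weight_scale * input_scale / output_scale,
# analogous to our float multiplier Q above.
torch_Q = float(per_ch_scales[0]) * inp_scale / outp_scale
print('torch effective multiplier:', torch_Q, '\r\nour multiplier Q:', Q, '\r\n')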
bqt_outp = modelb.module(modelb.quant(xt))
print('torch bias:\r\n',
      bqt_outp,
      '\r\nwhereas our bias is:\r\n',
      b, 'scale, zp: ', b_scale, b_zp, '\r\n')
print('output of quantized model:\r\n',
      q_outp,
      '\r\n\r\nwhereas our result is:\r\n',
      wxb,
      '\r\n\r\nand the ground truth is:\r\n',
      out_f,
      '\r\n\r\ntorch quantized output:\r\n',
      q_outp.int_repr(),
      '\r\n\r\nour quantized output:\r\n',
      wxb_q)
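# Final numeric comparison added for illustration (not part of the original gist).
print('abs diff, torch vs float:', float(torch.abs(f_outp - out_f).max()),
      '\r\nabs diff, ours vs float:', float(np.abs(wxb - out_f)[0]))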