Our result is approximately equal to the float output of the non-quantized convolution; however, PyTorch sometimes yields a result with a large error.
import numpy as np
import torch
import torch.quantization as tq
from torch import nn
from copy import deepcopy
## Manual quantization

# Weight, bias, and input
kernel_size = 3
w = np.random.randn(kernel_size, kernel_size)
# Bias is set to 0 here. With a non-zero bias, torch also yields incorrect results.
b = np.random.randn(1) * 0
x = np.random.randn(kernel_size, kernel_size)
print(w)
print(b)
print(x)

# Float reference output (the kernel covers the whole input, so the
# convolution output is a single value)
out_f = (np.sum(w * x) + b)[0]
print(out_f)
# First, calibrate wx + b, wx, w, x and b: derive an affine quantization
# scale and zero point from an observed (min, max) range.
def get_scale_zp(max_, min_, Qmin=0, Qmax=255):
    scale = (max_ - min_) / (Qmax - Qmin)
    if scale == 0.:
        return (1., 0.)
    zp = Qmin - np.round(min_ / scale)
    return (scale, zp)
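
# A quick illustrative check of get_scale_zp (the range below is made up
# for the example, not part of the pipeline): a range of [-1, 1] mapped to
# [0, 255] gives scale = 2/255 and a zero point in the middle of the
# integer range, so that q = round(v / scale) + zp.
ex_scale, ex_zp = get_scale_zp(1.0, -1.0)
print(ex_scale, ex_zp)  # ~0.00784, 128.0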
# MinMaxObserver, when processing a single-value tensor, updates only the
# min or the max; the other stays at 0. Mimic that by clamping each range
# to include 0.
wxb_max, wxb_min = (max(out_f, 0), min(out_f, 0))
wx_max, wx_min = (max(np.sum(w * x), 0), min(np.sum(w * x), 0))
w_max, w_min = (np.max(w), np.min(w))
x_max, x_min = (np.max(x), np.min(x))
b_max, b_min = (np.max(b), np.min(b))

wxb_scale, wxb_zp = get_scale_zp(wxb_max, wxb_min)
print(f'wx + b: {wxb_scale}, {wxb_zp}')
# It looks like this one is never actually used
wx_scale, wx_zp = get_scale_zp(wx_max, wx_min)
print(f'wx: {wx_scale}, {wx_zp}')
# Weights are quantized to qint8, so the integer range is [-128, 127]
w_scale, w_zp = get_scale_zp(w_max, w_min, -128, 127)
print(f'w: {w_scale}, {w_zp}')
x_scale, x_zp = get_scale_zp(x_max, x_min)
print(f'x: {x_scale}, {x_zp}')
b_scale, b_zp = get_scale_zp(max(b_max, 0), min(b_min, 0))
print(f'b: {b_scale}, {b_zp}')
# Quantize the input, weights, and bias
x_q = np.round(x / x_scale + x_zp)
w_q = np.round(w / w_scale + w_zp)
b_q = np.round(b / b_scale + b_zp)
print(w_q, '\r\n\r\n', b_q, '\r\n\r\n', x_q)
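
# Sanity check (an addition, not in the original pipeline): dequantizing
# the quantized input should reproduce x to within about half a
# quantization step, i.e. |x - (x_q - x_zp) * x_scale| <= x_scale / 2.
x_dq = (x_q - x_zp) * x_scale
print('max input round-trip error:', np.max(np.abs(x - x_dq)),
      'vs x_scale / 2 =', x_scale / 2)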
# Transform the float multiplier into a fixed-point one and perform the
# quantized operation: approximate Q = w_scale * x_scale / wxb_scale by
# A / 2**n, with A = floor(2**n * Q) chosen to fit in 16 bits.
Q = (w_scale * x_scale / wxb_scale)
n = int(np.floor(np.log2((2 ** 16 - 1) / Q)))
A = int(np.floor(2 ** n * Q))
# Clip the integer accumulator to the int16 range, rescale it with the
# fixed-point multiplier, add the requantized bias, and shift by the
# output zero point.
wx_q = np.round(np.clip(np.sum((w_q - w_zp) * (x_q - x_zp)), -32768, 32767) * A / (2 ** n)) + \
       np.round((b_q - b_zp) * (b_scale / wxb_scale)) + \
       wxb_zp
wx_q = np.clip(wx_q, 0, 255)
print(wx_q)
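
# Illustrative check (an addition): the fixed-point multiplier A / 2**n
# should approximate Q to roughly 16-bit relative precision, since A is
# chosen to be close to 2**16 - 1.
print('Q =', Q, ' A / 2**n =', A / 2 ** n,
      ' rel. error =', abs(Q - A / 2 ** n) / Q)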
# Dequantize
wxb_q = wx_q
wxb = (wxb_q - wxb_zp) * wxb_scale
print(f'dequant output:\r\n{wxb}')
print(f'diff:\r\n{wxb - out_f}')
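
# Hedged check (an addition): the dequantization error should be on the
# order of the output scale. This is not a strict bound, because the
# fixed-point multiplier and the bias requantization add their own rounding
# error on top of the final rounding step.
print(f'|error| / wxb_scale = {abs(wxb - out_f) / wxb_scale}')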
## Comparing with torch

xt = torch.Tensor([[x]])                # shape (1, 1, 3, 3): NCHW
wt = nn.Parameter(torch.Tensor([[w]]))  # shape (1, 1, 3, 3): out_ch, in_ch, kH, kW
bt = nn.Parameter(torch.Tensor(b))

def quantize(model, input_shape):
    # Note: input_shape is unused; calibration reuses the global input below
    with torch.no_grad():
        observer = tq.PerChannelMinMaxObserver()
        model.qconfig = torch.quantization.QConfig(activation=tq.MinMaxObserver,
                                                   weight=observer.with_args(dtype=torch.qint8,
                                                                             qscheme=torch.per_channel_affine))
        # model.qconfig = torch.quantization.get_default_qconfig('qnnpack')
        model = tq.QuantWrapper(model)
        tq.prepare(model, inplace=True)
        # One calibration pass so the observers record the (min, max) ranges
        tmp = model(torch.Tensor([[x]]))
        tq.convert(model, inplace=True)
        return model
model = nn.Conv2d(1, 1, kernel_size, bias=True)
model.weight = wt
model.bias = bt

# Generate another model with zero weights and the same bias, to see what
# value the bias is quantized to
modelb = deepcopy(model)
for p in modelb.parameters():
    torch.nn.init.zeros_(p)
modelb.bias = bt

print(model.weight.data, '\r\n\r\n',
      model.bias.data,
      '\r\n\r\ntorch output:\r\n',
      model(xt),
      '\r\n\r\nwhereas our output is:\r\n',
      out_f)

model = quantize(model, (1, kernel_size, kernel_size))
modelb = quantize(modelb, (1, kernel_size, kernel_size))
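
# Optional inspection (an addition): after tq.convert the wrapped conv is
# replaced by its quantized counterpart, so printing the model shows a
# QuantizedConv2d together with its output scale and zero point.
print(model)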
# Step-by-step forward pass through the wrapper:
q_inp = model.quant(xt)
q_outp = model.module(q_inp)
f_outp = model.dequant(q_outp)

inp_scale = q_inp.q_scale()
inp_zero_point = q_inp.q_zero_point()
outp_scale = q_outp.q_scale()
outp_zero_point = q_outp.q_zero_point()
print('torch input scale, zp:\r\n',
      inp_scale, inp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      x_scale, x_zp, '\r\n')
print('torch output scale, zp:\r\n',
      outp_scale, outp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      wxb_scale, wxb_zp, '\r\n')

per_ch_zp = model.module.weight().q_per_channel_zero_points()
per_ch_scales = model.module.weight().q_per_channel_scales()
print('torch channels scale, zp:\r\n',
      per_ch_scales, per_ch_zp,
      '\r\nwhereas our scale, zp are:\r\n',
      w_scale, w_zp, '\r\n')
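
# Cross-check (an addition, assuming a single output channel and the zero
# bias used above): feed torch's own quantized integers and scales through
# our integer pipeline. If this matches q_outp.int_repr(), the integer
# arithmetic agrees and any remaining discrepancy comes from the
# calibration parameters.
xq_t = q_inp.int_repr().numpy().astype(np.int64)
wq_t = model.module.weight().int_repr().numpy().astype(np.int64)
acc = np.sum((wq_t - int(per_ch_zp[0])) * (xq_t - inp_zero_point))
our_from_torch = np.round(acc * (float(per_ch_scales[0]) * inp_scale / outp_scale)) + outp_zero_point
print('our integer result from torch parameters:', np.clip(our_from_torch, 0, 255))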
bqt_outp = modelb.module(modelb.quant(xt))
print('torch bias:\r\n',
      bqt_outp,
      '\r\nwhereas our bias is:\r\n',
      b, 'scale, zp: ', b_scale, b_zp, '\r\n')

print('output of quantized model:\r\n',
      q_outp,
      '\r\n\r\nwhereas our result is:\r\n',
      wxb,
      '\r\n\r\nand the ground truth is:\r\n',
      out_f,
      '\r\n\r\ntorch quantized output:\r\n',
      q_outp.int_repr(),
      '\r\n\r\nour quantized output:\r\n',
      wxb_q)