Our result is approximately equal to the float output of the non-quantized convolution; however, PyTorch sometimes yields a result with a large error.
import numpy as np
import torch
import torch.quantization as tq
from torch import nn
from copy import deepcopy
## Manual quantization

# Weight, bias, and input
kernel_size = 3
w = np.random.randn(kernel_size, kernel_size)
# Bias is set to 0 here. With a non-zero bias, torch also yields incorrect results.
b = np.random.randn(1) * 0
x = np.random.randn(kernel_size, kernel_size)
print(w)
print(b)
print(x)

# Float reference output (the kernel covers the whole input, so the
# convolution output is a single value)
out_f = (np.sum(w * x) + b)[0]
print(out_f)
# First, calibrate wx + b, wx, w, x and b: derive an affine quantization
# scale and zero point from an observed (min, max) range.
def get_scale_zp(max_, min_, Qmin=0, Qmax=255):
    scale = (max_ - min_) / (Qmax - Qmin)
    if scale == 0.:
        return (1., 0.)
    zp = Qmin - np.round(min_ / scale)
    return (scale, zp)
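
# A quick illustrative check of get_scale_zp (the range below is made up
# for the example, not part of the pipeline): a range of [-1, 1] mapped to
# [0, 255] gives scale = 2/255 and a zero point in the middle of the
# integer range, so that q = round(v / scale) + zp.
ex_scale, ex_zp = get_scale_zp(1.0, -1.0)
print(ex_scale, ex_zp)  # ~0.00784, 128.0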
# MinMaxObserver, when processing a single-value tensor, updates only the
# min or the max; the other stays at 0. Mimic that by clamping each range
# to include 0.
wxb_max, wxb_min = (max(out_f, 0), min(out_f, 0))
wx_max, wx_min = (max(np.sum(w * x), 0), min(np.sum(w * x), 0))
w_max, w_min = (np.max(w), np.min(w))
x_max, x_min = (np.max(x), np.min(x))
b_max, b_min = (np.max(b), np.min(b))

wxb_scale, wxb_zp = get_scale_zp(wxb_max, wxb_min)
print(f'wx + b: {wxb_scale}, {wxb_zp}')
# It looks like this one is never actually used
wx_scale, wx_zp = get_scale_zp(wx_max, wx_min)
print(f'wx: {wx_scale}, {wx_zp}')
# Weights are quantized to qint8, so the integer range is [-128, 127]
w_scale, w_zp = get_scale_zp(w_max, w_min, -128, 127)
print(f'w: {w_scale}, {w_zp}')
x_scale, x_zp = get_scale_zp(x_max, x_min)
print(f'x: {x_scale}, {x_zp}')
b_scale, b_zp = get_scale_zp(max(b_max, 0), min(b_min, 0))
print(f'b: {b_scale}, {b_zp}')
# Quantize the input, weights, and bias
x_q = np.round(x / x_scale + x_zp)
w_q = np.round(w / w_scale + w_zp)
b_q = np.round(b / b_scale + b_zp)
print(w_q, '\r\n\r\n', b_q, '\r\n\r\n', x_q)
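
# Sanity check (an addition, not in the original pipeline): dequantizing
# the quantized input should reproduce x to within about half a
# quantization step, i.e. |x - (x_q - x_zp) * x_scale| <= x_scale / 2.
x_dq = (x_q - x_zp) * x_scale
print('max input round-trip error:', np.max(np.abs(x - x_dq)),
      'vs x_scale / 2 =', x_scale / 2)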
# Transform the float multiplier into a fixed-point one and perform the
# quantized operation: approximate Q = w_scale * x_scale / wxb_scale by
# A / 2**n, with A = floor(2**n * Q) chosen to fit in 16 bits.
Q = (w_scale * x_scale / wxb_scale)
n = int(np.floor(np.log2((2 ** 16 - 1) / Q)))
A = int(np.floor(2 ** n * Q))
# Clip the integer accumulator to the int16 range, rescale it with the
# fixed-point multiplier, add the requantized bias, and shift by the
# output zero point.
wx_q = np.round(np.clip(np.sum((w_q - w_zp) * (x_q - x_zp)), -32768, 32767) * A / (2 ** n)) + \
       np.round((b_q - b_zp) * (b_scale / wxb_scale)) + \
       wxb_zp
wx_q = np.clip(wx_q, 0, 255)
print(wx_q)
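
# Illustrative check (an addition): the fixed-point multiplier A / 2**n
# should approximate Q to roughly 16-bit relative precision, since A is
# chosen to be close to 2**16 - 1.
print('Q =', Q, ' A / 2**n =', A / 2 ** n,
      ' rel. error =', abs(Q - A / 2 ** n) / Q)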
# Dequantize
wxb_q = wx_q
wxb = (wxb_q - wxb_zp) * wxb_scale
print(f'dequant output:\r\n{wxb}')
print(f'diff:\r\n{wxb - out_f}')
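
# Hedged check (an addition): the dequantization error should be on the
# order of the output scale. This is not a strict bound, because the
# fixed-point multiplier and the bias requantization add their own rounding
# error on top of the final rounding step.
print(f'|error| / wxb_scale = {abs(wxb - out_f) / wxb_scale}')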
## Comparing with torch

xt = torch.Tensor([[x]])                # shape (1, 1, 3, 3): NCHW
wt = nn.Parameter(torch.Tensor([[w]]))  # shape (1, 1, 3, 3): out_ch, in_ch, kH, kW
bt = nn.Parameter(torch.Tensor(b))

def quantize(model, input_shape):
    # Note: input_shape is unused; calibration reuses the global input below
    with torch.no_grad():
        observer = tq.PerChannelMinMaxObserver()
        model.qconfig = torch.quantization.QConfig(activation=tq.MinMaxObserver,
                                                   weight=observer.with_args(dtype=torch.qint8,
                                                                             qscheme=torch.per_channel_affine))
        # model.qconfig = torch.quantization.get_default_qconfig('qnnpack')
        model = tq.QuantWrapper(model)
        tq.prepare(model, inplace=True)
        # One calibration pass so the observers record the (min, max) ranges
        tmp = model(torch.Tensor([[x]]))
        tq.convert(model, inplace=True)
        return model
model = nn.Conv2d(1, 1, kernel_size, bias=True)
model.weight = wt
model.bias = bt

# Generate another model with zero weights and the same bias, to see what
# value the bias is quantized to
modelb = deepcopy(model)
for p in modelb.parameters():
    torch.nn.init.zeros_(p)
modelb.bias = bt

print(model.weight.data, '\r\n\r\n',
      model.bias.data,
      '\r\n\r\ntorch output:\r\n',
      model(xt),
      '\r\n\r\nwhereas our output is:\r\n',
      out_f)

model = quantize(model, (1, kernel_size, kernel_size))
modelb = quantize(modelb, (1, kernel_size, kernel_size))
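
# Optional inspection (an addition): after tq.convert the wrapped conv is
# replaced by its quantized counterpart, so printing the model shows a
# QuantizedConv2d together with its output scale and zero point.
print(model)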
# Step-by-step forward pass through the wrapper:
q_inp = model.quant(xt)
q_outp = model.module(q_inp)
f_outp = model.dequant(q_outp)

inp_scale = q_inp.q_scale()
inp_zero_point = q_inp.q_zero_point()
outp_scale = q_outp.q_scale()
outp_zero_point = q_outp.q_zero_point()
print('torch input scale, zp:\r\n',
      inp_scale, inp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      x_scale, x_zp, '\r\n')
print('torch output scale, zp:\r\n',
      outp_scale, outp_zero_point,
      '\r\nwhereas our scale, zp are:\r\n',
      wxb_scale, wxb_zp, '\r\n')

per_ch_zp = model.module.weight().q_per_channel_zero_points()
per_ch_scales = model.module.weight().q_per_channel_scales()
print('torch channels scale, zp:\r\n',
      per_ch_scales, per_ch_zp,
      '\r\nwhereas our scale, zp are:\r\n',
      w_scale, w_zp, '\r\n')
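
# Cross-check (an addition, assuming a single output channel and the zero
# bias used above): feed torch's own quantized integers and scales through
# our integer pipeline. If this matches q_outp.int_repr(), the integer
# arithmetic agrees and any remaining discrepancy comes from the
# calibration parameters.
xq_t = q_inp.int_repr().numpy().astype(np.int64)
wq_t = model.module.weight().int_repr().numpy().astype(np.int64)
acc = np.sum((wq_t - int(per_ch_zp[0])) * (xq_t - inp_zero_point))
our_from_torch = np.round(acc * (float(per_ch_scales[0]) * inp_scale / outp_scale)) + outp_zero_point
print('our integer result from torch parameters:', np.clip(our_from_torch, 0, 255))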
bqt_outp = modelb.module(modelb.quant(xt))
print('torch bias:\r\n',
      bqt_outp,
      '\r\nwhereas our bias is:\r\n',
      b, 'scale, zp: ', b_scale, b_zp, '\r\n')

print('output of quantized model:\r\n',
      q_outp,
      '\r\n\r\nwhereas our result is:\r\n',
      wxb,
      '\r\n\r\nand the ground truth is:\r\n',
      out_f,
      '\r\n\r\ntorch quantized output:\r\n',
      q_outp.int_repr(),
      '\r\n\r\nour quantized output:\r\n',
      wxb_q)