"""Supportive descriptor of quantization
Describe how a tensor should be quantized. A QuantDescriptor and a tensor defines a quantized tensor.
Args:
num_bits: An integer. Number of bits of quantization. It is used to calculate scaling factor. Default 8.
name: Seems a nice thing to have
Keyword Arguments:
fake_quant: A boolean. If True, use fake quantization mode. Default True.
axis: None, int or tuple of int. axes which will have its own max for computing scaling factor.
If None (the default), use per tensor scale.
Must be in the range [-rank(input_tensor), rank(input_tensor)).
e.g. For a KCRS weight tensor, quant_axis=(0) will yield per channel scaling.
Default None.
amax: A float or list/ndarray of floats of user specified absolute max range(绝对值最大的元素). If supplied,
ignore quant_axis and use this to quantize. If learn_amax is True, will be used to initialize
learnable amax. Default None.
learn_amax: A boolean. If True, learn amax. Default False.
scale_amax: A float. If supplied, multiply amax by scale_amax. Default None. It is useful for some
quick experiment.
calib_method: A string. One of ["max", "histogram"] indicates which calibration to use. Except the simple
max calibration, other methods are all hisogram based. Default "max".
unsigned: A Boolean. If True, use unsigned. Default False.
Raises:
TypeError: If unsupported type is passed in.
Read-only properties:
- fake_quant:
- name:
- learn_amax:
- scale_amax:
- axis:
- calib_method:
- num_bits:
- amax:
- unsigned:
"""
# Find the TensorQuantizers and enable calibration
for name, module in model.named_modules():
    if name.endswith('_quantizer'):
        module.enable_calib()
        module.disable_quant()  # Use full precision data to calibrate

# Feed data samples
model(x)
# ...

# Finalize calibration
for name, module in model.named_modules():
    if name.endswith('_quantizer'):
        module.load_calib_amax()
        module.disable_calib()
        module.enable_quant()

# If running on GPU, call .cuda() again because the calibration process creates new tensors
model.cuda()

# Keep running the quantized model
# ...
Calibration needs to be done before exporting the model to ONNX.
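A minimal export sketch, assuming a calibrated model on GPU and a hypothetical 224x224 input shape; use_fb_fake_quant switches TensorQuantizer to emit standard ONNX QuantizeLinear/DequantizeLinear ops:

import torch
from pytorch_quantization import nn as quant_nn

# Export TensorQuantizer as standard fake-quant ONNX ops
# (QuantizeLinear/DequantizeLinear); requires opset 13 or newer.
quant_nn.TensorQuantizer.use_fb_fake_quant = True

dummy_input = torch.randn(1, 3, 224, 224, device="cuda")  # hypothetical input shape
torch.onnx.export(model, dummy_input, "quant_model.onnx", opset_version=13)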
Quantization Aware Training
Quantization Aware Training is based on the straight-through estimator (STE) derivative approximation: fake quantization is applied in the forward pass, while gradients pass through the quantizer unchanged in the backward pass.
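The STE idea can be sketched as a custom autograd function. This is an illustrative toy, not the library's implementation; FakeQuantSTE is a hypothetical name:

import torch

class FakeQuantSTE(torch.autograd.Function):
    """Toy straight-through estimator: round-to-scale in forward,
    identity gradient in backward."""

    @staticmethod
    def forward(ctx, x, amax, num_bits=8):
        bound = 2 ** (num_bits - 1) - 1   # e.g. 127 for signed 8-bit
        scale = bound / amax
        return torch.clamp((x * scale).round(), -bound, bound) / scale

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through: treat round() as identity for the gradient
        return grad_output, None, None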
import sys

import torch
import torch.utils.data
from torch import nn
from tqdm import tqdm

from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
from torchvision import models

sys.path.append("path to torchvision/references/classification/")
from train import evaluate, train_one_epoch, load_data
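The snippets below assume a quantized ResNet-50 and ImageNet data loaders. One way to set them up, using the library's quant_modules monkey-patching, is sketched here (the data loaders are assumptions, not shown in the original):

from pytorch_quantization import quant_modules

quant_modules.initialize()  # replace torch.nn layers with quantized equivalents
model = models.resnet50(pretrained=True)  # Conv2d/Linear now carry TensorQuantizers
model.cuda()

# data_loader / data_loader_test are assumed to be ImageNet loaders,
# e.g. built with load_data() from the torchvision classification reference.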
def collect_stats(model, data_loader, num_batches):
    """Feed data to the network and collect statistics."""
    # Enable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
        model(image.cuda())
        if i >= num_batches:
            break

    # Disable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
def compute_amax(model, **kwargs):
    # Load calibration results
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)
            print(f"{name:40}: {module}")
    model.cuda()
# It is a bit slow since we collect histograms on CPU
with torch.no_grad():
    collect_stats(model, data_loader, num_batches=2)
    compute_amax(model, method="percentile", percentile=99.99)

criterion = nn.CrossEntropyLoss()
with torch.no_grad():
    evaluate(model, criterion, data_loader_test, device="cuda", print_freq=20)

# Save the model
torch.save(model.state_dict(), "/tmp/quant_resnet50-calibrated.pth")
The top-1 accuracy is 76.1%, close to the 76.2% accuracy of the pretrained model.
Use different calibration methods
We can try different calibration methods without recollecting the histograms, and see which one gives the best accuracy.
with torch.no_grad():
    compute_amax(model, method="percentile", percentile=99.9)
    evaluate(model, criterion, data_loader_test, device="cuda", print_freq=20)

with torch.no_grad():
    for method in ["mse", "entropy"]:
        print(f"{method} calibration")
        compute_amax(model, method=method)
        evaluate(model, criterion, data_loader_test, device="cuda", print_freq=20)
Next, fine-tune the calibrated model for one epoch:

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Training takes about one and a half hours per epoch on a single V100
train_one_epoch(model, criterion, optimizer, data_loader, "cuda", 0, 100)

# Save the model
torch.save(model.state_dict(), "/tmp/quant_resnet50-finetuned.pth")
To keep the addition in a residual block fully quantized, an extra quantizer is inserted on the identity branch:

def forward(self, x: Tensor) -> Tensor:
    # other code...
    if self._quantize:
        out += self.residual_quantizer(identity)
    else:
        out += identity
    out = self.relu(out)
    return out
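For completeness, a hedged sketch of where residual_quantizer and _quantize might come from in the block's __init__; the names follow the snippet above, and the exact placement is an assumption:

from pytorch_quantization import nn as quant_nn

# Inside the residual block's __init__ (sketch):
if quantize:
    # Share the input descriptor so the residual is quantized the same
    # way as the convolution inputs it is added to.
    self.residual_quantizer = quant_nn.TensorQuantizer(
        quant_nn.QuantConv2d.default_quant_desc_input)
self._quantize = quantize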