Quantization
The basic idea is to compress the model's parameters.
One compression approach is pruning, which removes connections (weights) from the network.
Knowledge distillation compresses a model by using the original model (the teacher) to train a smaller model (the student).
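Both approaches can be illustrated with a short, hedged sketch (not from the original lesson; the layer sizes, pruning `amount`, and `temperature` are arbitrary choices):
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.prune as prune

# Pruning: zero out the 30% smallest-magnitude weights of a linear layer.
layer = nn.Linear(16, 16)
prune.l1_unstructured(layer, name="weight", amount=0.3)
print(f"weight sparsity: {(layer.weight == 0).float().mean().item():.2f}")

# Knowledge distillation: train the student to match the teacher's softened outputs.
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
    log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(log_probs, soft_targets, reduction="batchmean") * temperature ** 2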
The core principle of quantization is to store the model's parameters in a smaller data type, i.e., at lower precision.
Floating-point types come in different bit widths such as FP32, FP16, and BF16, while integer types include int32, int16, and int8.
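As a rough illustration (not from the original lesson), the memory needed just to store the weights of a 1-billion-parameter model scales directly with the bit width:
# Approximate storage for 1B parameters at different precisions (illustrative).
params = 1_000_000_000
for name, bytes_per_param in [("fp32", 4), ("fp16/bf16", 2), ("int8", 1), ("int4", 0.5)]:
    print(f"{name:>9}: {params * bytes_per_param / 1e9:.1f} GB")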
Let's look at the common data types used to store the parameters of machine learning models.
import torch
# Information of `8-bit unsigned integer`
torch.iinfo(torch.uint8)
> iinfo(min=0, max=255, dtype=uint8)
# Information of `8-bit (signed) integer`
torch.iinfo(torch.int8)
> iinfo(min=-128, max=127, dtype=int8)
# Information of `16-bit (signed) integer`
torch.iinfo(torch.int16)
> iinfo(min=-32768, max=32767, dtype=int16)
# Information of `32-bit (signed) integer`
torch.iinfo(torch.int32)
> iinfo(min=-2.14748e+09, max=2.14748e+09, dtype=int32)
# Information of `64-bit (signed) integer`
torch.iinfo(torch.int64)
> iinfo(min=-9.22337e+18, max=9.22337e+18, dtype=int64)
# by default, python stores float data in fp64
value = 1/3
format(value, '.60f')
> '0.333333333333333314829616256247390992939472198486328125000000'
# 64-bit floating point
tensor_fp64 = torch.tensor(value, dtype = torch.float64)
print(f"fp64 tensor: {format(tensor_fp64.item(), '.60f')}")
> fp64 tensor: 0.333333333333333314829616256247390992939472198486328125000000
tensor_fp32 = torch.tensor(value, dtype = torch.float32)
tensor_fp16 = torch.tensor(value, dtype = torch.float16)
tensor_bf16 = torch.tensor(value, dtype = torch.bfloat16)
print(f"fp64 tensor: {format(tensor_fp64.item(), '.60f')}")
print(f"fp32 tensor: {format(tensor_fp32.item(), '.60f')}")
print(f"fp16 tensor: {format(tensor_fp16.item(), '.60f')}")
print(f"bf16 tensor: {format(tensor_bf16.item(), '.60f')}")
> fp64 tensor: 0.333333333333333314829616256247390992939472198486328125000000
fp32 tensor: 0.333333343267440795898437500000000000000000000000000000000000
fp16 tensor: 0.333251953125000000000000000000000000000000000000000000000000
bf16 tensor: 0.333984375000000000000000000000000000000000000000000000000000
# Information of `16-bit brain floating point`
torch.finfo(torch.bfloat16)
> finfo(resolution=0.01, min=-3.38953e+38, max=3.38953e+38, eps=0.0078125, smallest_normal=1.17549e-38, tiny=1.17549e-38, dtype=bfloat16)
# Information of `16-bit floating point`
torch.finfo(torch.float16)
> finfo(resolution=0.001, min=-65504, max=65504, eps=0.000976562, smallest_normal=6.10352e-05, tiny=6.10352e-05, dtype=float16)
# Information of `32-bit floating point`
torch.finfo(torch.float32)
> finfo(resolution=1e-06, min=-3.40282e+38, max=3.40282e+38, eps=1.19209e-07, smallest_normal=1.17549e-38, tiny=1.17549e-38, dtype=float32)
# Information of `64-bit floating point`
torch.finfo(torch.float64)
> finfo(resolution=1e-15, min=-1.79769e+308, max=1.79769e+308, eps=2.22045e-16, smallest_normal=2.22507e-308, tiny=2.22507e-308, dtype=float64)
# random pytorch tensor: float32, size=1000
tensor_fp32 = torch.rand(1000, dtype = torch.float32)
# first 5 elements of the random tensor
tensor_fp32[:5]
> tensor([0.6183, 0.6456, 0.0934, 0.3034, 0.5854])
# downcast the tensor to bfloat16 using the "to" method
tensor_fp32_to_bf16 = tensor_fp32.to(dtype = torch.bfloat16)
tensor_fp32_to_bf16[:5]
> tensor([0.6172, 0.6445, 0.0933, 0.3027, 0.5859], dtype=torch.bfloat16)
# tensor_fp32 x tensor_fp32
m_float32 = torch.dot(tensor_fp32, tensor_fp32)
m_float32
> tensor(344.0361)
# tensor_fp32_to_bf16 x tensor_fp32_to_bf16
m_bfloat16 = torch.dot(tensor_fp32_to_bf16, tensor_fp32_to_bf16)
m_bfloat16
> tensor(344., dtype=torch.bfloat16)
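As a small follow-up (not in the original notebook), the relative error introduced by the bfloat16 accumulation can be computed explicitly:
# Relative error between the bf16 and fp32 dot products (illustrative).
rel_err = abs(m_bfloat16.float().item() - m_float32.item()) / m_float32.item()
print(f"relative error: {rel_err:.6f}")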
Now let's load an ML model in different data types.
import torch
import torch.nn as nn
import requests
from PIL import Image
import warnings
# Ignore specific UserWarnings related to max_length in transformers
warnings.filterwarnings("ignore",
message=".*Using the model-agnostic default `max_length`.*")
class DummyModel(nn.Module):
    """
    A dummy model that consists of an embedding layer
    with two blocks of a linear layer followed by a layer
    norm layer.
    """
    def __init__(self):
        super().__init__()
        torch.manual_seed(123)
        self.token_embedding = nn.Embedding(2, 2)
        # Block 1
        self.linear_1 = nn.Linear(2, 2)
        self.layernorm_1 = nn.LayerNorm(2)
        # Block 2
        self.linear_2 = nn.Linear(2, 2)
        self.layernorm_2 = nn.LayerNorm(2)
        self.head = nn.Linear(2, 2)

    def forward(self, x):
        hidden_states = self.token_embedding(x)
        # Block 1
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.layernorm_1(hidden_states)
        # Block 2
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.layernorm_2(hidden_states)
        logits = self.head(hidden_states)
        return logits
model = DummyModel()
model
> DummyModel(
(token_embedding): Embedding(2, 2)
(linear_1): Linear(in_features=2, out_features=2, bias=True)
(layernorm_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(linear_2): Linear(in_features=2, out_features=2, bias=True)
(layernorm_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(head): Linear(in_features=2, out_features=2, bias=True)
)
Create a function that inspects the data types of the parameters in a model.
def print_param_dtype(model):
    for name, param in model.named_parameters():
        print(f"{name} is loaded in {param.dtype}")
print_param_dtype(model)
> token_embedding.weight is loaded in torch.float32
linear_1.weight is loaded in torch.float32
linear_1.bias is loaded in torch.float32
layernorm_1.weight is loaded in torch.float32
layernorm_1.bias is loaded in torch.float32
linear_2.weight is loaded in torch.float32
linear_2.bias is loaded in torch.float32
layernorm_2.weight is loaded in torch.float32
layernorm_2.bias is loaded in torch.float32
head.weight is loaded in torch.float32
head.bias is loaded in torch.float32
float16
Cast the model into a different precision.
# float 16
model_fp16 = DummyModel().half()
Inspect the data types of the parameters.
print_param_dtype(model_fp16)
> token_embedding.weight is loaded in torch.float16
linear_1.weight is loaded in torch.float16
linear_1.bias is loaded in torch.float16
layernorm_1.weight is loaded in torch.float16
layernorm_1.bias is loaded in torch.float16
linear_2.weight is loaded in torch.float16
linear_2.bias is loaded in torch.float16
layernorm_2.weight is loaded in torch.float16
layernorm_2.bias is loaded in torch.float16
head.weight is loaded in torch.float16
head.bias is loaded in torch.float16
model_fp16
> DummyModel(
(token_embedding): Embedding(2, 2)
(linear_1): Linear(in_features=2, out_features=2, bias=True)
(layernorm_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(linear_2): Linear(in_features=2, out_features=2, bias=True)
(layernorm_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
(head): Linear(in_features=2, out_features=2, bias=True)
)
Run a simple inference with the model.
import torch
dummy_input = torch.LongTensor([[1, 0], [0, 1]])
# inference using float32 model
logits_fp32 = model(dummy_input)
logits_fp32
> tensor([[[-0.6872, 0.7132],
[-0.6872, 0.7132]],
[[-0.6872, 0.7132],
[-0.6872, 0.7132]]], grad_fn=<ViewBackward0>)
# inference using float16 model
try:
    logits_fp16 = model_fp16(dummy_input)
except Exception as error:
    print("\033[91m", type(error).__name__, ": ", error, "\033[0m")
> RuntimeError : "LayerNormKernelImpl" not implemented for 'Half'
bfloat16
from copy import deepcopy
model_bf16 = deepcopy(model)
model_bf16 = model_bf16.to(torch.bfloat16)
print_param_dtype(model_bf16)
logits_bf16 = model_bf16(dummy_input)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
Now let's compare the difference between logits_fp32 and logits_bf16.
mean_diff = torch.abs(logits_bf16 - logits_fp32).mean().item()
max_diff = torch.abs(logits_bf16 - logits_fp32).max().item()
print(f"Mean diff: {mean_diff} | Max diff: {max_diff}")
> Mean diff: 0.000997886061668396 | Max diff: 0.0016907453536987305
from transformers import BlipForConditionalGeneration
model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
# inspect the default data types of the model
# print_param_dtype(model)
Check the model's memory footprint.
fp32_mem_footprint = model.get_memory_footprint()
print("Footprint of the fp32 model in bytes: ",
fp32_mem_footprint)
print("Footprint of the fp32 model in MBs: ",
fp32_mem_footprint/1e+6)
> Footprint of the fp32 model in bytes: 989660400
Footprint of the fp32 model in MBs: 989.6604
Load the same model in bfloat16.
model_bf16 = BlipForConditionalGeneration.from_pretrained(
model_name,
torch_dtype=torch.bfloat16
)
bf16_mem_footprint = model_bf16.get_memory_footprint()
# Get the relative difference
relative_diff = bf16_mem_footprint / fp32_mem_footprint
print("Footprint of the bf16 model in MBs: ",
bf16_mem_footprint/1e+6)
print(f"Relative diff: {relative_diff}")
> Footprint of the bf16 model in MBs: 494.832248
Relative diff: 0.5000020693967345
float32 vs bfloat16
Now let's compare the generation results of the two models.
def get_generation(model, processor, image, dtype):
    inputs = processor(image, return_tensors="pt").to(dtype)
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)

def load_image(img_url):
    image = Image.open(requests.get(
        img_url, stream=True).raw).convert('RGB')
    return image
from transformers import BlipProcessor
processor = BlipProcessor.from_pretrained(model_name)
from IPython.display import display
img_url = 'https://storage.googleapis.com/\
sfr-vision-language-research/BLIP/demo.jpg'
image = load_image(img_url)
display(image.resize((500, 350)))
results_fp32 = get_generation(model,
processor,
image,
torch.float32)
print("fp32 Model Results:\n", results_fp32)
> fp32 Model Results:
a woman sitting on the beach with her dog
results_bf16 = get_generation(model_bf16,
processor,
image,
torch.bfloat16)
print("bf16 Model Results:\n", results_bf16)
> bf16 Model Results:
a woman sitting on the beach with a dog
For the Hugging Face Transformers library, the default data type used when loading a model is float32.
You can set the 'default data type' to whatever you want.
desired_dtype = torch.bfloat16
torch.set_default_dtype(desired_dtype)
dummy_model_bf16 = DummyModel()
print_param_dtype(dummy_model_bf16)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
Likewise, the default data type can be reset to float32. Note that models created earlier keep the dtype they were created with:
torch.set_default_dtype(torch.float32)
print_param_dtype(dummy_model_bf16)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
Linear Quantization
LLM.INT8 (8-bit only)
QLoRA (4-bit only)
HQQ (down to 2-bit)
Fine-tuning a Quantized Model
Helps the quantized model recover the accuracy of the original
Allows fine-tuning the model for a specific use case or application at the same time
QAT: Fine-tune with Quantization-Aware Training
Fine-tune the model so that its quantized version performs optimally (see the sketch below).
This is not compatible with Post-Training Quantization (PTQ) techniques.
Linear quantization is an example of PTQ.
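A minimal sketch of the idea behind QAT (not the torch.ao quantization API; the function name and bit width are illustrative): training runs a "fake quantization" round trip on the weights in the forward pass, while the detach() trick lets gradients flow straight through.
import torch

def fake_quantize(w: torch.Tensor, bits: int = 8):
    # Quantize-dequantize round trip; w + (w_q - w).detach() gives a
    # straight-through gradient so the model can still be trained.
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().max() / qmax
    w_q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax) * scale
    return w + (w_q - w).detach()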
PEFT (Parameter-Efficient Fine-Tuning)
Drastically reduces the number of trainable parameters while keeping performance on par with full fine-tuning.
A representative combination is PEFT + QLoRA: https://pytorch.org/blog/finetune-llms/
QLoRA quantizes the pretrained base weights (blue in the figure) to 4-bit precision.
For computation, they are dequantized to match the precision of the Low-Rank Adapter (LoRA) weights (orange in the figure).
The model adds the activations from the pretrained weights (blue) and the adapter weights (orange).
The sum of these two activations is fed as input to the next layer of the network.
-> With QLoRA you get more efficient fine-tuning and a smaller model! A minimal sketch of the adapter idea follows below.
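A minimal sketch of the adapter idea in plain PyTorch (the class name, `rank`, and `alpha` are illustrative, and the base layer is kept in full precision here rather than 4-bit as in real QLoRA):
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base                    # pretrained weights ("blue"), frozen
        for p in self.base.parameters():
            p.requires_grad = False
        self.lora_A = nn.Linear(base.in_features, rank, bias=False)   # adapter ("orange")
        self.lora_B = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.lora_B.weight)  # adapter starts as a no-op
        self.scaling = alpha / rank

    def forward(self, x):
        # The base activation and the low-rank adapter activation are summed,
        # and the sum feeds the next layer.
        return self.base(x) + self.scaling * self.lora_B(self.lora_A(x))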
Let's take a closer look at linear quantization.
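Before turning to the quanto library, here is a minimal sketch of the underlying math (asymmetric per-tensor quantization to int8; the helper names are illustrative, not the quanto API):
import torch

def linear_quantize(x: torch.Tensor, dtype=torch.int8):
    # scale and zero_point map the float range [min, max] onto the integer range.
    qmin, qmax = torch.iinfo(dtype).min, torch.iinfo(dtype).max
    scale = (x.max() - x.min()) / (qmax - qmin)
    zero_point = qmin - x.min() / scale        # real implementations also round this
    q = torch.clamp(torch.round(x / scale + zero_point), qmin, qmax).to(dtype)
    return q, scale, zero_point

def linear_dequantize(q, scale, zero_point):
    return scale * (q.float() - zero_point)

x = torch.randn(4, 4)
q, s, z = linear_quantize(x)
print("max abs error:", (x - linear_dequantize(q, s, z)).abs().max().item())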
#%pip install sentencepiece
#%pip install quanto==0.0.11
import torch
def named_module_tensors(module, recurse=False):
    for named_parameter in module.named_parameters(recurse=recurse):
        name, val = named_parameter
        if hasattr(val, "_data") or hasattr(val, "_scale"):
            if hasattr(val, "_data"):
                yield name + "._data", val._data
            if hasattr(val, "_scale"):
                yield name + "._scale", val._scale
        else:
            yield named_parameter

    for named_buffer in module.named_buffers(recurse=recurse):
        yield named_buffer

def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8

def compute_module_sizes(model):
    """
    Compute the size of each submodule of a given model.
    """
    from collections import defaultdict
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
        size = tensor.numel() * dtype_byte_size(tensor.dtype)
        name_parts = name.split(".")
        for idx in range(len(name_parts) + 1):
            module_sizes[".".join(name_parts[:idx])] += size
    return module_sizes
model_name = "google/flan-t5-small"
import sentencepiece as spm
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
input_text = "Hello, my name is "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
> The model size is 0.307844608 GB
from quanto import quantize, freeze
import torch
quantize(model, weights=torch.int8, activations=None)
print(model)
freeze(model)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
> The model size is 0.12682868 GB
input_text = "Hello, my name is "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
The differences between linear quantization and downcasting can be summarized as follows:
When you downcast a model, you convert its parameters to a more compact data type (such as bfloat16). During inference, the model performs its computations in that data type, and its activations are in that data type as well. Downcasting works reasonably well with bfloat16, but model performance degrades with smaller data types, and it may not work at all for integer data types (such as the int8 used in this lesson).
With linear quantization, the compressed data type is converted back to the original FP32 data type during inference, so the quantized model stays much closer in quality to the original model. When the model makes a prediction, the matrix multiplications are performed in FP32 and the activations are in FP32. This is what makes it possible to quantize the model to data types smaller than bfloat16, such as int8 in this example.
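The contrast can be made concrete with a small sketch (illustrative only; the matrix sizes and the simple symmetric int8 scheme are assumptions, not the quanto implementation):
import torch

W = torch.randn(64, 64)   # original fp32 weights
x = torch.randn(8, 64)    # fp32 activations

# Downcasting: weights and compute both stay in the smaller dtype.
y_downcast = x.to(torch.bfloat16) @ W.to(torch.bfloat16).T

# Linear quantization: weights are stored in int8 but dequantized back to fp32,
# so the matmul and the activations run in fp32.
scale = W.abs().max() / 127
W_int8 = torch.clamp(torch.round(W / scale), -128, 127).to(torch.int8)
y_quantized = x @ (W_int8.float() * scale).T

y_ref = x @ W.T
print("downcast  max err:", (y_downcast.float() - y_ref).abs().max().item())
print("quantized max err:", (y_quantized - y_ref).abs().max().item())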