# tensor_fp32 x tensor_fp32
m_float32 = torch.dot(tensor_fp32, tensor_fp32)
m_float32
> tensor(344.0361)
# tensor_fp32_to_bf16 x tensor_fp32_to_bf16
m_bfloat16 = torch.dot(tensor_fp32_to_bf16, tensor_fp32_to_bf16)
m_bfloat16
> tensor(344., dtype=torch.bfloat16)
ML Models with Different Data Types
Load an ML model in different data types.
import torch
import torch.nn as nn
import requests
from PIL import Image
import warnings
# Ignore specific UserWarnings related to max_length in transformers
warnings.filterwarnings("ignore",
                        message=".*Using the model-agnostic default `max_length`.*")
class DummyModel(nn.Module):
    """
    A dummy model that consists of an embedding layer
    with two blocks of a linear layer followed by a layer
    norm layer.
    """
    def __init__(self):
        super().__init__()
        torch.manual_seed(123)
        self.token_embedding = nn.Embedding(2, 2)
        # Block 1
        self.linear_1 = nn.Linear(2, 2)
        self.layernorm_1 = nn.LayerNorm(2)
        # Block 2
        self.linear_2 = nn.Linear(2, 2)
        self.layernorm_2 = nn.LayerNorm(2)
        self.head = nn.Linear(2, 2)

    def forward(self, x):
        hidden_states = self.token_embedding(x)
        # Block 1
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.layernorm_1(hidden_states)
        # Block 2
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.layernorm_2(hidden_states)
        logits = self.head(hidden_states)
        return logits
def print_param_dtype(model):
    for name, param in model.named_parameters():
        print(f"{name} is loaded in {param.dtype}")

model = DummyModel()
print_param_dtype(model)
> token_embedding.weight is loaded in torch.float32
linear_1.weight is loaded in torch.float32
linear_1.bias is loaded in torch.float32
layernorm_1.weight is loaded in torch.float32
layernorm_1.bias is loaded in torch.float32
linear_2.weight is loaded in torch.float32
linear_2.bias is loaded in torch.float32
layernorm_2.weight is loaded in torch.float32
layernorm_2.bias is loaded in torch.float32
head.weight is loaded in torch.float32
head.bias is loaded in torch.float32
Model Casting: float16
Cast the model into a different precision.
# float 16
model_fp16 = DummyModel().half()
Inspect the data types of the parameters.
print_param_dtype(model_fp16)
> token_embedding.weight is loaded in torch.float16
linear_1.weight is loaded in torch.float16
linear_1.bias is loaded in torch.float16
layernorm_1.weight is loaded in torch.float16
layernorm_1.bias is loaded in torch.float16
linear_2.weight is loaded in torch.float16
linear_2.bias is loaded in torch.float16
layernorm_2.weight is loaded in torch.float16
layernorm_2.bias is loaded in torch.float16
head.weight is loaded in torch.float16
head.bias is loaded in torch.float16
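Model Casting: bfloat16
The bfloat16 listing below is shown without the code that produced it; it was presumably generated along these lines (the variable name model_bf16 is an assumption):
model_bf16 = DummyModel().to(torch.bfloat16)
print_param_dtype(model_bf16)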
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
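The Mean diff / Max diff line below compares the logits of the float32 and bfloat16 models. The code producing it is not shown; a minimal sketch would look like the following (the dummy input and variable names are assumptions, so the exact numbers may differ):
dummy_input = torch.LongTensor([[1, 0], [0, 1]])
logits_fp32 = model(dummy_input)        # float32 model from above
logits_bf16 = model_bf16(dummy_input)   # bfloat16 copy
mean_diff = torch.abs(logits_bf16 - logits_fp32).mean().item()
max_diff = torch.abs(logits_bf16 - logits_fp32).max().item()
print(f"Mean diff: {mean_diff} | Max diff: {max_diff}")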
> Mean diff: 0.000997886061668396 | Max diff: 0.0016907453536987305
LLM Models in Different Data Types
from transformers import BlipForConditionalGeneration
model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
# inspect the default data types of the model
# print_param_dtype(model)
Check the model's memory footprint.
fp32_mem_footprint = model.get_memory_footprint()
print("Footprint of the fp32 model in bytes: ",
fp32_mem_footprint)
print("Footprint of the fp32 model in MBs: ",
fp32_mem_footprint/1e+6)
> Footprint of the fp32 model in bytes: 989660400
Footprint of the fp32 model in MBs: 989.6604
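The bf16_mem_footprint value used below comes from a bfloat16 copy of the model that is not shown above; it was presumably obtained roughly as follows (variable names assumed):
model_bf16 = BlipForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16)
bf16_mem_footprint = model_bf16.get_memory_footprint()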
# Get the relative difference
relative_diff = bf16_mem_footprint / fp32_mem_footprint
print("Footprint of the bf16 model in MBs: ",
bf16_mem_footprint/1e+6)
print(f"Relative diff: {relative_diff}")
> Footprint of the bf16 model in MBs: 494.832248
Relative diff: 0.5000020693967345
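The bfloat16 parameter listing below refers to dummy_model_bf16, whose creation is not shown above; it was presumably created by first switching PyTorch's default dtype, roughly as follows:
torch.set_default_dtype(torch.bfloat16)
dummy_model_bf16 = DummyModel()
print_param_dtype(dummy_model_bf16)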
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
Likewise, the default data type can be reset to float32.
torch.set_default_dtype(torch.float32)
print_param_dtype(dummy_model_bf16)
> token_embedding.weight is loaded in torch.bfloat16
linear_1.weight is loaded in torch.bfloat16
linear_1.bias is loaded in torch.bfloat16
layernorm_1.weight is loaded in torch.bfloat16
layernorm_1.bias is loaded in torch.bfloat16
linear_2.weight is loaded in torch.bfloat16
linear_2.bias is loaded in torch.bfloat16
layernorm_2.weight is loaded in torch.bfloat16
layernorm_2.bias is loaded in torch.bfloat16
head.weight is loaded in torch.bfloat16
head.bias is loaded in torch.bfloat16
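Resetting the default dtype only affects models created afterwards, which is why the parameters of dummy_model_bf16 above are still bfloat16. An illustrative check (not from the lesson, names assumed):
model_fp32_again = DummyModel()
print_param_dtype(model_fp32_again)   # every parameter is created in torch.float32 again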
Key LLM quantization approaches:
Linear Quantization
LLM.INT8 (8-bit only)
QLoRA (4-bit only)
HQQ (down to 2-bit)
Fine-tuning with quantization
Fine-tuning a quantized model
Recovers the accuracy lost during quantization
Lets you fine-tune the model for a specific use case or application at the same time
QAT: Fine-tune with Quantization-Aware Training
Fine-tunes the model so that its quantized version performs as well as possible.
It is not compatible with Post-Training Quantization (PTQ) techniques.
The linear quantization method is an example of PTQ (a minimal sketch follows this list).
PEFT (Parameter-Efficient Fine-Tuning)
Dramatically reduces the number of trainable parameters while keeping performance on par with full fine-tuning.
A typical combination is PEFT + QLoRA: https://pytorch.org/blog/finetune-llms/
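As referenced above, here is a minimal per-tensor int8 linear (affine) quantization sketch. It illustrates the general technique only; it is not the lesson's library code:
import torch

def linear_quantize(x, num_bits=8):
    # q = round(x / scale) + zero_point, clamped to the signed integer range
    qmin, qmax = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
    scale = (x.max() - x.min()) / (qmax - qmin)
    zero_point = int(round(qmin - (x.min() / scale).item()))
    q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.int8)
    return q, scale.item(), zero_point

def linear_dequantize(q, scale, zero_point):
    # Recover an approximation of the original tensor
    return scale * (q.to(torch.float32) - zero_point)

x = torch.randn(4, 4)
q, scale, zp = linear_quantize(x)
x_hat = linear_dequantize(q, scale, zp)
print("max abs error:", (x - x_hat).abs().max().item())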
The QLoRA approach
QLoRA quantizes the pretrained base weights (blue in the figure) to 4-bit precision.
During the forward pass these are brought to the precision of the Low-Rank Adapter (LoRA) weights (orange in the figure).
The model can then add the activations of the pretrained weights (blue) and the adapter weights (orange), as sketched below.
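A conceptual sketch of how a QLoRA-style layer combines the two. This is illustrative only, not the bitsandbytes/PEFT implementation; real QLoRA stores the base weight in 4-bit NF4, while int8 is used here as a stand-in:
import torch
import torch.nn as nn

class QLoRALinearSketch(nn.Module):
    def __init__(self, w_q, scale, zero_point, in_features, out_features, r=8, alpha=16):
        super().__init__()
        # Frozen, quantized base weight (the "blue" weights) plus its quantization parameters
        self.w_q = w_q
        self.scale = scale
        self.zero_point = zero_point
        # Trainable low-rank adapter (the "orange" weights), kept in bfloat16
        self.lora_A = nn.Linear(in_features, r, bias=False, dtype=torch.bfloat16)
        self.lora_B = nn.Linear(r, out_features, bias=False, dtype=torch.bfloat16)
        nn.init.zeros_(self.lora_B.weight)   # adapter starts as a no-op
        self.scaling = alpha / r

    def forward(self, x):
        # x is assumed to be a bfloat16 tensor of shape (batch, in_features)
        # Dequantize the frozen base weight to the adapter's precision before the matmul
        w = (self.w_q.to(torch.bfloat16) - self.zero_point) * self.scale
        base_out = x @ w.t()
        # Add the activations of the base weights and the adapter weights
        return base_out + self.lora_B(self.lora_A(x)) * self.scaling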
import torch

def named_module_tensors(module, recurse=False):
    for named_parameter in module.named_parameters(recurse=recurse):
        name, val = named_parameter
        # Quantized parameters expose their integer payload and scale as separate tensors
        if hasattr(val, "_data") or hasattr(val, "_scale"):
            if hasattr(val, "_data"):
                yield name + "._data", val._data
            if hasattr(val, "_scale"):
                yield name + "._scale", val._scale
        else:
            yield named_parameter

    for named_buffer in module.named_buffers(recurse=recurse):
        yield named_buffer
def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8
def compute_module_sizes(model):
    """
    Compute the size of each submodule of a given model.
    """
    from collections import defaultdict
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
        size = tensor.numel() * dtype_byte_size(tensor.dtype)
        name_parts = name.split(".")
        for idx in range(len(name_parts) + 1):
            module_sizes[".".join(name_parts[:idx])] += size
    return module_sizes
Without Quantization
model_name = "google/flan-t5-small"
import sentencepiece as spm
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
input_text = "Hello, my name is "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
> The model size is 0.307844608 GB
Quantize the model (8-bit precision)
from quanto import quantize, freeze
import torch
quantize(model, weights=torch.int8, activations=None)
print(model)
Freeze the model
freeze(model)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")
> The model size is 0.12682868 GB
Try running inference on the quantized model
input_text = "Hello, my name is "
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))
To summarize the difference between linear quantization and downcasting:
When you downcast a model, you convert its parameters to a more compact data type (bfloat16). During inference, the model carries out its computations in that data type and its activations are in that data type as well. Downcasting can work with the bfloat16 data type, but model performance tends to degrade with smaller data types, and it may not work at all when converting to an integer data type (such as the int8 used in this lesson).
With linear quantization, the compressed data type is converted back to the original FP32 data type during inference, which keeps the quantized model's performance much closer to the original model's. So when the model makes a prediction, the matrix multiplications are performed in FP32 and the activations are in FP32. This is what makes it possible to quantize the model to data types smaller than bfloat16, such as the int8 used in this example. A minimal sketch of this store-in-int8, dequantize-for-compute idea follows.
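The sketch below assumes per-tensor symmetric quantization; it shows the principle only, not how quanto is implemented internally:
import torch

# Store the weight as int8 plus a per-tensor scale
w_fp32 = torch.randn(64, 64)
scale = w_fp32.abs().max() / 127
w_int8 = torch.clamp(torch.round(w_fp32 / scale), -128, 127).to(torch.int8)

# At inference time, dequantize back to float32 so the matmul runs in full precision
x = torch.randn(1, 64)
w_dequant = w_int8.to(torch.float32) * scale
y = x @ w_dequant.t()

print("weight storage:", w_int8.numel() * w_int8.element_size(), "bytes (int8)")
print("max weight error:", (w_fp32 - w_dequant).abs().max().item())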