Large Language Model Fine-Tuning (SFT)
1. Fine-Tuning Overview
1.1 What Is Fine-Tuning?
Fine-tuning takes a pretrained large model and continues training it on task- or domain-specific data, so that the model adapts to that particular task or domain.
┌─────────────────────────────────────────────────┐
│                Fine-Tuning Flow                 │
├─────────────────────────────────────────────────┤
│                                                 │
│  Pretrained model        Fine-tuning data       │
│  (GPT-4, Llama2...)  +  ──────►  Fine-tuned     │
│        │                         model          │
│  General knowledge                              │
│  ────────────────►                              │
└─────────────────────────────────────────────────┘
1.2 Fine-Tuning Types
| Type | Characteristics | Best for | Cost |
|---|---|---|---|
| Full fine-tuning | Updates all parameters | Large-scale customization | High |
| LoRA | Updates only low-rank matrices | Most scenarios | Low |
| Prefix tuning | Prepends trainable prefixes | Lightweight adaptation | Low |
| Adapter tuning | Inserts adapter layers | Multi-task | Medium |
| Hybrid | Combines several methods | Complex scenarios | Medium |
1.3 Benefits of Fine-Tuning
┌─────────────────────────────────────────────────┐
│             The Value of Fine-Tuning            │
├─────────────────────────────────────────────────┤
│                                                 │
│  ✓ Inject domain knowledge                      │
│  ✓ Optimize task behavior                       │
│  ✓ Control output format                        │
│  ✓ Reduce hallucination rate                    │
│  ✓ Improve inference efficiency                 │
│  ✓ Protect data privacy                         │
│                                                 │
└─────────────────────────────────────────────────┘
2. Data Preparation
2.1 Data Formats
from typing import List, Dict, Optional
from dataclasses import dataclass
from enum import Enum

class TaskType(Enum):
    """Task type"""
    CAUSAL_LM = "causal_lm"            # causal language modeling
    MASKED_LM = "masked_lm"            # masked language modeling
    SEQ_TO_SEQ = "seq_to_seq"          # sequence-to-sequence
    CLASSIFICATION = "classification"  # classification

@dataclass
class TrainingExample:
    """A single training example"""
    input: str                    # input text
    output: Optional[str] = None  # output text (seq2seq)
    label: Optional[int] = None   # label (classification)
    metadata: Dict = None         # metadata
    task_type: TaskType = TaskType.CAUSAL_LM

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}

    def to_causal_lm_format(self) -> Dict:
        """Convert to causal-LM format"""
        if self.task_type == TaskType.CAUSAL_LM:
            return {"text": self.input}
        elif self.task_type == TaskType.SEQ_TO_SEQ:
            return {"text": f"{self.input} -> {self.output}"}
        else:
            raise ValueError(f"Unsupported format: {self.task_type}")

    def to_instruction_format(self) -> Dict:
        """Convert to instruction format"""
        return {
            "instruction": self.input,
            "output": self.output or "",
            "input": ""
        }

    def to_chat_format(self) -> List[Dict]:
        """Convert to chat format"""
        messages = [
            {"role": "user", "content": self.input}
        ]
        if self.output:
            messages.append({"role": "assistant", "content": self.output})
        return messages
class DatasetLoader:
    """Dataset loader"""

    def __init__(self, task_type: TaskType = TaskType.CAUSAL_LM):
        self.task_type = task_type
        self.examples: List[TrainingExample] = []

    def load_from_json(self, file_path: str):
        """Load from a JSON file"""
        import json
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            for item in data:
                self.examples.append(TrainingExample(**item))
        elif isinstance(data, dict):
            if "data" in data:
                for item in data["data"]:
                    self.examples.append(TrainingExample(**item))

    def load_from_jsonl(self, file_path: str):
        """Load from a JSONL file"""
        import json
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    item = json.loads(line)
                    self.examples.append(TrainingExample(**item))

    def load_from_csv(self, file_path: str):
        """Load from a CSV file"""
        import pandas as pd
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            example = TrainingExample(
                input=str(row.get("input", "")),
                output=str(row.get("output", "")) if "output" in row else None,
                label=int(row["label"]) if "label" in row else None,
                metadata=row.to_dict()
            )
            self.examples.append(example)

    def get_training_data(self, format: str = "causal") -> List[Dict]:
        """Get training data in the requested format"""
        if format == "causal":
            return [e.to_causal_lm_format() for e in self.examples]
        elif format == "instruction":
            return [e.to_instruction_format() for e in self.examples]
        elif format == "chat":
            return [e.to_chat_format() for e in self.examples]
        else:
            raise ValueError(f"Unknown format: {format}")

    def split(
        self,
        train_ratio: float = 0.8,
        val_ratio: float = 0.1
    ) -> tuple:
        """Split the dataset into train/val/test"""
        n = len(self.examples)
        train_end = int(n * train_ratio)
        val_end = int(n * (train_ratio + val_ratio))
        return (
            self.examples[:train_end],
            self.examples[train_end:val_end],
            self.examples[val_end:]
        )
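A minimal usage sketch of the loader, assuming a JSONL file whose records match the TrainingExample fields (the path is a placeholder):

loader = DatasetLoader(task_type=TaskType.CAUSAL_LM)
loader.load_from_jsonl("./data/train.jsonl")  # placeholder path
train_set, val_set, test_set = loader.split(train_ratio=0.8, val_ratio=0.1)
print(f"train={len(train_set)} val={len(val_set)} test={len(test_set)}")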
2.2 Data Preprocessing
import re
from typing import List

class DataPreprocessor:
    """Data preprocessor"""

    def __init__(self):
        self.cleaners = []

    def add_cleaner(self, cleaner):
        """Register a cleaning function"""
        self.cleaners.append(cleaner)

    def clean_text(self, text: str) -> str:
        """Run all registered cleaners over a text"""
        for cleaner in self.cleaners:
            text = cleaner(text)
        return text

    def clean_examples(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """Clean a list of examples"""
        cleaned = []
        for example in examples:
            cleaned_input = self.clean_text(example.input)
            cleaned_output = self.clean_text(example.output) if example.output else None
            cleaned.append(TrainingExample(
                input=cleaned_input,
                output=cleaned_output,
                label=example.label,
                metadata=example.metadata.copy(),
                task_type=example.task_type
            ))
        return cleaned

    # Built-in cleaners
    @staticmethod
    def remove_special_chars(text: str) -> str:
        """Remove special characters"""
        # Keep CJK characters, letters, digits, and basic punctuation
        text = re.sub(r'[^\w\s\u4e00-\u9fff\uff00-\uffef.,;:!?()【】""''-]', '', text)
        return text

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize whitespace"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def remove_empty_lines(text: str) -> str:
        """Remove empty lines"""
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        return '\n'.join(lines)

    @staticmethod
    def truncate_by_tokens(text: str, max_tokens: int = 1000) -> str:
        """Truncate by token count"""
        # Simplified: truncate by characters.
        # In practice, use the model's tokenizer.
        char_limit = max_tokens * 2  # assume ~2 characters per token
        return text[:char_limit]
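A short sketch chaining the built-in cleaners; note the order matters, so whitespace normalization runs last here (the loader instance is the one from section 2.1):

preprocessor = DataPreprocessor()
preprocessor.add_cleaner(DataPreprocessor.remove_special_chars)
preprocessor.add_cleaner(DataPreprocessor.remove_empty_lines)
preprocessor.add_cleaner(DataPreprocessor.normalize_whitespace)
cleaned_examples = preprocessor.clean_examples(loader.examples)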
2.3 Data Augmentation
from typing import List, Callable
import random

class DataAugmentor:
    """Data augmentor"""

    def __init__(self):
        self.augmenters: List[Callable] = []

    def add_augmenter(self, augmenter: Callable):
        """Register an augmentation function"""
        self.augmenters.append(augmenter)

    def augment_examples(
        self,
        examples: List[TrainingExample],
        augment_ratio: float = 0.3
    ) -> List[TrainingExample]:
        """Augment a subset of the examples"""
        augmented = examples.copy()
        # Randomly select a subset of examples to augment
        n_augment = int(len(examples) * augment_ratio)
        selected = random.sample(examples, min(n_augment, len(examples)))
        for example in selected:
            for augmenter in self.augmenters:
                try:
                    new_example = augmenter(example)
                    augmented.append(new_example)
                except Exception:
                    continue
        return augmented

    # Built-in augmenters
    @staticmethod
    def synonym_replacement(example: TrainingExample) -> TrainingExample:
        """Synonym replacement (simplified)"""
        # In practice, use a proper synonym lexicon
        synonyms = {
            "好": ["优秀", "棒", "不错"],
            "大": ["巨大", "大型", "宏大"],
            "小": ["微小", "小型", "细微"]
        }
        text = example.input
        for word, syns in synonyms.items():
            if word in text:
                replacement = random.choice(syns)
                text = text.replace(word, replacement, 1)
        return TrainingExample(
            input=text,
            output=example.output,
            label=example.label,
            metadata=example.metadata.copy(),
            task_type=example.task_type
        )

    @staticmethod
    def back_translation(example: TrainingExample) -> TrainingExample:
        """Back-translation (requires a translation API)"""
        # Simplified: returns the example unchanged
        return example

    @staticmethod
    def noise_injection(example: TrainingExample) -> TrainingExample:
        """Inject filler-word noise"""
        noises = ["的", "了", "是", "很"]
        noise = random.choice(noises)
        # Insert at a random position
        words = example.input.split()
        if len(words) > 1:
            pos = random.randint(0, len(words) - 1)
            words.insert(pos, noise)
        return TrainingExample(
            input=' '.join(words),
            output=example.output,
            label=example.label,
            metadata=example.metadata.copy(),
            task_type=example.task_type
        )
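A brief usage sketch of the augmentor (assuming examples were loaded as in section 2.1); each selected example gains one augmented copy per registered augmenter:

augmentor = DataAugmentor()
augmentor.add_augmenter(DataAugmentor.synonym_replacement)
augmentor.add_augmenter(DataAugmentor.noise_injection)
# Roughly 30% of examples get augmented copies appended to the original list
bigger_set = augmentor.augment_examples(loader.examples, augment_ratio=0.3)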
3. Fine-Tuning Methods
3.1 Full Fine-Tuning
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset

class FullFineTuner:
    """Full-parameter fine-tuner"""

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2-7B-Instruct"
    ):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None

    def load_model(self):
        """Load the model and tokenizer"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)

    def prepare_dataset(
        self,
        examples: List[TrainingExample],
        max_length: int = 512
    ):
        """Build a PyTorch dataset"""
        class SFTDataset(Dataset):
            def __init__(self, examples, tokenizer, max_length):
                self.examples = examples
                self.tokenizer = tokenizer
                self.max_length = max_length

            def __len__(self):
                return len(self.examples)

            def __getitem__(self, idx):
                example = self.examples[idx]
                # Build the training text
                if example.task_type == TaskType.SEQ_TO_SEQ:
                    text = f"{example.input} -> {example.output}"
                else:
                    text = example.input
                # Tokenize
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    max_length=self.max_length,
                    padding="max_length",
                    return_tensors="pt"
                )
                return {
                    "input_ids": encodings["input_ids"].squeeze(),
                    "attention_mask": encodings["attention_mask"].squeeze(),
                    "labels": encodings["input_ids"].squeeze()  # causal LM: labels = input_ids
                }

        return SFTDataset(examples, self.tokenizer, max_length)

    def train(
        self,
        train_examples: List[TrainingExample],
        val_examples: List[TrainingExample],
        output_dir: str = "./output",
        **training_args
    ):
        """Train (call load_model() first)"""
        # Prepare datasets
        train_dataset = self.prepare_dataset(train_examples)
        val_dataset = self.prepare_dataset(val_examples) if val_examples else None
        # Training arguments
        default_args = {
            "output_dir": output_dir,
            "num_train_epochs": 3,
            "per_device_train_batch_size": 4,
            "per_device_eval_batch_size": 4,
            "gradient_accumulation_steps": 1,
            "optim": "adamw_torch",
            "save_steps": 500,
            "logging_steps": 50,
            "learning_rate": 2e-5,
            "weight_decay": 0.01,
            "warmup_ratio": 0.1,
            "lr_scheduler_type": "cosine",
            "fp16": True,
            "gradient_checkpointing": True,
        }
        default_args.update(training_args)
        training_args = TrainingArguments(**default_args)
        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
        )
        # Start training
        trainer.train()
        # Save the model
        trainer.save_model(output_dir)
        return trainer
3.2 Instruction Tuning
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)

class InstructionTuner:
    """Instruction tuner"""

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2-7B-Instruct"
    ):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        # Set pad_token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def format_instruction(
        self,
        instruction: str,
        input_text: str = "",
        output: str = ""
    ) -> str:
        """Format an instruction example"""
        # Adjust the template per model family
        if "Qwen" in self.model_name:
            return (
                f"<|im_start|>user\n{instruction}<|im_end|>\n"
                f"<|im_start|>assistant\n{output}<|im_end|>"
            )
        elif "Llama" in self.model_name:
            return f"[INST] {instruction} {input_text} [/INST] {output}"
        else:
            return f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"

    def prepare_dataset(
        self,
        examples: List[TrainingExample],
        max_length: int = 512
    ):
        """Build a PyTorch dataset"""
        class InstructionDataset(Dataset):
            def __init__(self, examples, tokenizer, format_func, max_length):
                self.examples = examples
                self.tokenizer = tokenizer
                self.format_func = format_func
                self.max_length = max_length

            def __len__(self):
                return len(self.examples)

            def __getitem__(self, idx):
                example = self.examples[idx]
                # Format as an instruction pair when an output is available
                if example.output is not None:
                    text = self.format_func(
                        instruction=example.input,
                        output=example.output
                    )
                else:
                    text = example.input
                # Tokenize
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    max_length=self.max_length,
                    padding="max_length",
                    return_tensors="pt"
                )
                return {
                    "input_ids": encodings["input_ids"].squeeze(),
                    "attention_mask": encodings["attention_mask"].squeeze(),
                    "labels": encodings["input_ids"].squeeze()
                }

        return InstructionDataset(
            examples,
            self.tokenizer,
            self.format_instruction,
            max_length
        )

    def train(
        self,
        train_examples: List[TrainingExample],
        val_examples: List[TrainingExample],
        output_dir: str = "./output/instruction_tuned",
        **training_args
    ):
        """Train"""
        train_dataset = self.prepare_dataset(train_examples)
        val_dataset = self.prepare_dataset(val_examples) if val_examples else None
        default_args = {
            "output_dir": output_dir,
            "num_train_epochs": 3,
            "per_device_train_batch_size": 4,
            "per_device_eval_batch_size": 4,
            "gradient_accumulation_steps": 1,
            "learning_rate": 2e-5,
            "fp16": True,
            "gradient_checkpointing": True,
            "save_total_limit": 3,
            "logging_steps": 10,
        }
        default_args.update(training_args)
        training_args = TrainingArguments(**default_args)
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        trainer.save_model(output_dir)
        return trainer
4. LoRA Fine-Tuning
4.1 LoRA Overview
LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning method: the pretrained weights stay frozen, and small low-rank matrices added on top of them are trained to adapt the model to a new task.
┌─────────────────────────────────────────────────┐
│                LoRA: How It Works               │
├─────────────────────────────────────────────────┤
│                                                 │
│  Original weights W (d x k)                     │
│                                                 │
│  ┌─────────────────────┐                        │
│  │ Pretrained (frozen) │                        │
│  └─────────────────────┘                        │
│            +                                    │
│  ┌──────────────────────┐                       │
│  │ LoRA matrix A (r x k)│                       │
│  │ LoRA matrix B (d x r)│                       │
│  │ W' = W + BA          │                       │
│  └──────────────────────┘                       │
│            ↓                                    │
│  Fine-tuned weights W'                          │
│                                                 │
│  Parameters: d x k → d x k (frozen)             │
│              + d x r + r x k (trainable)        │
│  (r << min(d, k); typically r = 4, 8, or 16)    │
│                                                 │
└─────────────────────────────────────────────────┘
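To make the diagram concrete, here is a minimal LoRA linear-layer sketch in plain PyTorch. This is not the peft implementation; the class name LoRALinear and the initialization details are illustrative:

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Minimal sketch: y = x W^T + (alpha / r) * x A^T B^T"""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 32):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                           # freeze pretrained W
        d, k = base.out_features, base.in_features
        self.lora_A = nn.Parameter(torch.randn(r, k) * 0.01)  # A: r x k
        self.lora_B = nn.Parameter(torch.zeros(d, r))         # B: d x r, zero init so W' = W at start
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * (x @ self.lora_A.T) @ self.lora_B.T

layer = LoRALinear(nn.Linear(4096, 4096), r=8, alpha=32)
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(trainable)  # 8 * (4096 + 4096) = 65,536 vs ~16.8M frozen in the base layer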
4.2 LoRA Configuration
from peft import LoraConfig, get_peft_model
from peft import TaskType as PeftTaskType  # avoid clashing with our own TaskType enum

class LoRATuner:
    """LoRA fine-tuner"""

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2-7B-Instruct"
    ):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def get_lora_config(
        self,
        r: int = 8,                  # LoRA rank
        lora_alpha: int = 32,        # LoRA alpha
        lora_dropout: float = 0.05,  # dropout
        bias: str = "none"           # how to handle bias terms
    ) -> LoraConfig:
        """Build the LoRA config"""
        return LoraConfig(
            r=r,                               # LoRA rank
            lora_alpha=lora_alpha,             # LoRA alpha (scaling = alpha / r)
            lora_dropout=lora_dropout,         # dropout
            bias=bias,                         # bias: 'none', 'all', 'lora_only'
            task_type=PeftTaskType.CAUSAL_LM,  # task type
            target_modules=[                   # layers to apply LoRA to
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj"
            ],
            inference_mode=False,              # training, not inference
        )

    def apply_lora(
        self,
        lora_config: LoraConfig
    ):
        """Wrap the model with LoRA adapters"""
        self.model = get_peft_model(self.model, lora_config)
        # Print the trainable parameter count
        self.model.print_trainable_parameters()

    def train(
        self,
        train_examples: List[TrainingExample],
        val_examples: List[TrainingExample],
        output_dir: str = "./output/lora_tuned",
        lora_r: int = 8,
        lora_alpha: int = 32,
        **training_args
    ):
        """Train with LoRA"""
        # Configure LoRA
        lora_config = self.get_lora_config(
            r=lora_r,
            lora_alpha=lora_alpha
        )
        # Apply LoRA
        self.apply_lora(lora_config)
        # Prepare datasets
        train_dataset = self.prepare_dataset(train_examples)
        val_dataset = self.prepare_dataset(val_examples) if val_examples else None
        # Training arguments
        default_args = {
            "output_dir": output_dir,
            "num_train_epochs": 3,
            "per_device_train_batch_size": 4,
            "gradient_accumulation_steps": 1,
            "learning_rate": 2e-4,
            "fp16": True,
            "gradient_checkpointing": True,
            "logging_steps": 10,
            "save_total_limit": 2,
        }
        default_args.update(training_args)
        training_args = TrainingArguments(**default_args)
        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        # Save the LoRA weights
        trainer.save_model(output_dir)
        return trainer

    def prepare_dataset(self, examples: List[TrainingExample], max_length: int = 512):
        """Build a PyTorch dataset"""
        class SFTDataset(Dataset):
            def __init__(self, examples, tokenizer, max_length):
                self.examples = examples
                self.tokenizer = tokenizer
                self.max_length = max_length

            def __len__(self):
                return len(self.examples)

            def __getitem__(self, idx):
                example = self.examples[idx]
                text = example.input
                if example.output:
                    text = f"{text} -> {example.output}"
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    max_length=self.max_length,
                    padding="max_length",
                    return_tensors="pt"
                )
                return {
                    "input_ids": encodings["input_ids"].squeeze(),
                    "attention_mask": encodings["attention_mask"].squeeze(),
                    "labels": encodings["input_ids"].squeeze()
                }

        return SFTDataset(examples, self.tokenizer, max_length)

    def merge_and_save(self, output_dir: str):
        """Merge LoRA weights into the base model and save"""
        # Fold the LoRA update BA into the original weights
        merged_model = self.model.merge_and_unload()
        # Save the merged model and tokenizer
        merged_model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
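Alternatively, the unmerged adapter saved by trainer.save_model can be attached to the base model at load time with peft's PeftModel; a minimal sketch (paths are placeholders):

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the frozen base model, then attach the saved LoRA adapter
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct")
model = PeftModel.from_pretrained(base, "./output/lora_tuned")  # adapter directory
model = model.merge_and_unload()  # optional: fold BA into W for plain inference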
5. PEFT Techniques
5.1 QLoRA (Quantized LoRA)
from peft import prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

class QLoRATuner(LoRATuner):
    """Quantized-LoRA (QLoRA) fine-tuner"""

    def load_quantized_model(
        self,
        load_in_4bit: bool = True,
        load_in_8bit: bool = False
    ):
        """Load the base model with 4-bit or 8-bit quantization"""
        if load_in_8bit:
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0
            )
        else:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            device_map="auto"
        )
        # Prepare the quantized model for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)
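A usage sketch: since QLoRATuner inherits train from LoRATuner, the only extra step is reloading the base weights in quantized form first (train_examples/val_examples are assumed prepared as in section 2):

tuner = QLoRATuner(model_name="Qwen/Qwen2-7B-Instruct")
tuner.load_quantized_model(load_in_4bit=True)  # replace fp16 weights with 4-bit ones
trainer = tuner.train(
    train_examples=train_examples,  # assumed prepared earlier
    val_examples=val_examples,
    output_dir="./output/qlora_tuned",
    learning_rate=2e-4,
)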
5.2 P-tuning
from peft import PromptTuningConfig, get_peft_model

class PromptTuner:
    """Prompt-tuning / P-tuning fine-tuner"""

    def __init__(
        self,
        model_name: str = "gpt2-medium"
    ):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def get_prompt_tuning_config(
        self,
        num_virtual_tokens: int = 100
    ) -> PromptTuningConfig:
        """Build the prompt-tuning config"""
        return PromptTuningConfig(
            task_type=PeftTaskType.CAUSAL_LM,
            num_virtual_tokens=num_virtual_tokens,  # number of virtual tokens
            tokenizer_name_or_path=self.model_name,
            prompt_tuning_init="TEXT",              # initialize from real-token embeddings
            prompt_tuning_init_text="Answer the following question:",  # example init text
        )

    def apply_prompt_tuning(self, config: PromptTuningConfig):
        """Wrap the model with prompt tuning"""
        self.model = get_peft_model(self.model, config)
6. Fine-Tuning Evaluation
6.1 Evaluation Metrics
from typing import List
import numpy as np

class FineTuneEvaluator:
    """Fine-tuning evaluator"""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def evaluate_generation(
        self,
        test_examples: List[TrainingExample],
        max_length: int = 100
    ) -> Dict:
        """Evaluate generation quality"""
        predictions = []
        references = []
        for example in test_examples:
            # Generate a prediction
            prediction = self.generate(example.input, max_length)
            predictions.append(prediction)
            references.append(example.output or "")
        # BLEU (simplified)
        bleu_scores = [self._calculate_bleu(p, r) for p, r in zip(predictions, references)]
        # ROUGE (simplified)
        rouge_scores = [self._calculate_rouge(p, r) for p, r in zip(predictions, references)]
        return {
            "bleu": np.mean(bleu_scores),
            "rouge1": np.mean([s["rouge1"] for s in rouge_scores]),
            "rouge2": np.mean([s["rouge2"] for s in rouge_scores]),
            "rougeL": np.mean([s["rougeL"] for s in rouge_scores])
        }

    def generate(self, prompt: str, max_length: int = 100) -> str:
        """Generate text"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def _calculate_bleu(self, pred: str, ref: str) -> float:
        """BLEU (simplified to a unigram F1)"""
        pred_words = pred.split()
        ref_words = ref.split()
        if not pred_words:
            return 0
        # Unigram overlap
        pred_set = set(pred_words)
        ref_set = set(ref_words)
        match = len(pred_set & ref_set)
        # Precision
        precision = match / len(pred_set) if pred_set else 0
        # Recall
        recall = match / len(ref_set) if ref_set else 0
        # F1
        if precision + recall == 0:
            return 0
        return 2 * precision * recall / (precision + recall)

    def _calculate_rouge(self, pred: str, ref: str) -> Dict:
        """ROUGE (simplified)"""
        pred_words = set(pred.split())
        ref_words = set(ref.split())
        if not ref_words:
            return {"rouge1": 0, "rouge2": 0, "rougeL": 0}
        # ROUGE-N, collapsed here to a Jaccard overlap
        intersection = len(pred_words & ref_words)
        union = len(pred_words | ref_words)
        score = intersection / union if union > 0 else 0
        return {"rouge1": score, "rouge2": score, "rougeL": score}
6.2 Perplexity Evaluation
def calculate_perplexity(model, tokenizer, text: str) -> float:
    """Compute perplexity: PPL = exp(mean cross-entropy loss)"""
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss.item()
    perplexity = np.exp(loss)
    return perplexity
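A self-contained usage sketch; "gpt2" here simply stands in for any checkpoint, and a fine-tuned model would normally show lower perplexity than its base on in-domain text:

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
m = AutoModelForCausalLM.from_pretrained("gpt2")
print(calculate_perplexity(m, tok, "Fine-tuning adapts a pretrained model to a domain."))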
7. Model Deployment
7.1 Model Quantization
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def quantize_model(
    model_path: str,
    output_path: str,
    quantization: str = "int8"  # "int8" or "float16"
):
    """Quantize a model for deployment"""
    # Load the model
    model = AutoModelForCausalLM.from_pretrained(model_path)
    # Quantize
    if quantization == "int8":
        model = torch.quantization.quantize_dynamic(
            model,
            {torch.nn.Linear},
            torch.qint8
        )
    elif quantization == "float16":
        model = model.half()
    # Save the quantized model
    model.save_pretrained(output_path)
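An example invocation (paths are placeholders); fp16 halves the on-disk size, while dynamic int8 additionally speeds up CPU inference:

quantize_model("./output/merged_model", "./output/model_fp16", quantization="float16")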
7.2 Model Inference
class QuantizedModelInference:
    """Inference with a quantized model"""

    def __init__(self, model_path: str):
        # Quantization config
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True
        )
        # Load the model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quantization_config,
            device_map="auto"
        )

    def generate(self, prompt: str, max_length: int = 100) -> str:
        """Generate text"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
8. Implementation Example
8.1 End-to-End Fine-Tuning Pipeline
"""
完整微调流程
1. 数据准备
2. 数据增强
3. LoRA微调
4. 评估
5. 部署
"""
if __name__ == "__main__":
# 1. 加载数据
print("=== 加载数据 ===")
loader = DatasetLoader(task_type=TaskType.CAUSAL_LM)
loader.load_from_jsonl("./data/train_data.jsonl")
# 2. 数据分割
train_examples, val_examples, test_examples = loader.split(
train_ratio=0.8,
val_ratio=0.1
)
print(f"训练集: {len(train_examples)}")
print(f"验证集: {len(val_examples)}")
print(f"测试集: {len(test_examples)}")
# 3. 数据增强(可选)
print("\n=== 数据增强 ===")
augmentor = DataAugmentor()
augmentor.add_augmenter(DataAugmentor.noise_injection)
train_examples = augmentor.augment_examples(
train_examples,
augment_ratio=0.1
)
print(f"增强后训练集: {len(train_examples)}")
# 4. LoRA微调
print("\n=== LoRA微调 ===")
tuner = LoRATuner(model_name="Qwen/Qwen2-7B-Instruct")
trainer = tuner.train(
train_examples=train_examples,
val_examples=val_examples,
output_dir="./output/lora_model",
lora_r=8,
lora_alpha=32,
num_train_epochs=3,
per_device_train_batch_size=4,
learning_rate=2e-4
)
# 5. 合并模型
print("\n=== 合并模型 ===")
tuner.merge_and_save("./output/merged_model")
# 6. 评估
print("\n=== 评估模型 ===")
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("./output/merged_model")
tokenizer = AutoTokenizer.from_pretrained("./output/merged_model")
evaluator = FineTuneEvaluator(model, tokenizer)
metrics = evaluator.evaluate_generation(test_examples)
print(f"BLEU: {metrics['bleu']:.4f}")
print(f"ROUGE-1: {metrics['rouge1']:.4f}")
print(f"ROUGE-2: {metrics['rouge2']:.4f}")
print(f"ROUGE-L: {metrics['rougeL']:.4f}")
print("\n=== 微调完成 ===")
Frequently Asked Interview Questions
Q1: What is the difference between full fine-tuning and LoRA fine-tuning?
Model answer:
How they differ:
| Dimension | Full fine-tuning | LoRA fine-tuning |
|------|---------|----------|
| Parameters updated | All parameters | Only the low-rank matrices |
| GPU memory | High (gradients and optimizer state for every weight) | Low (only the LoRA parameters) |
| Training speed | Slow | Fast |
| Adaptation capacity | Full | More limited |
| Storage | Full model checkpoint | LoRA weights only |
LoRA principle:
- W' = W + BA (A and B are low-rank matrices)
- The original weights W stay frozen; only A and B are trained
- Typical setting: r = 8 or 16
Engineering choice:
- Ample resources → full fine-tuning
- Constrained resources → LoRA
- Multi-task fine-tuning → LoRA (an independent adapter per task)
Q2: How much data does fine-tuning need?
Model answer:
Suggested data volumes:
1. Instruction tuning
- Simple tasks: 100-1,000 examples
- Moderately complex: 1,000-10,000
- Complex tasks: 10,000-100,000
2. LoRA fine-tuning
- Light adaptation: 100-1,000
- Domain adaptation: 1,000-10,000
- Style adaptation: 10,000-50,000
3. Full fine-tuning
- At least 10,000 high-quality examples
- Ideally 50,000-100,000
Data quality requirements:
- Diversity: cover the full range of scenarios
- Accuracy: labels must be correct
- Balance: classes represented evenly
- Cleanliness: noisy examples removed
Parameter-count note:
A LoRA adapter adds r·(d+k) trainable parameters per adapted d×k weight matrix, with r ≪ min(d, k).
There is no dependable closed-form rule linking sample count to parameter count; validate on a small dataset first and scale the data until validation loss stops improving.
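A quick worked count for the formula above (the 4096×4096 projection size and rank 8 are illustrative values):

d, k, r = 4096, 4096, 8    # example attention projection, rank 8
lora_params = r * (d + k)  # 65,536 trainable parameters
full_params = d * k        # 16,777,216 frozen parameters
print(f"LoRA trains {lora_params / full_params:.2%} of this layer")  # ~0.39%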
Q3: How do you choose fine-tuning hyperparameters?
Model answer:
Key hyperparameters:
1. Learning rate
- LoRA: 1e-4 to 2e-4
- Full fine-tuning: 1e-5 to 5e-5
- QLoRA: 2e-4 to 5e-4
2. LoRA rank (r)
- 4-16 (8 is the common default)
- Smaller r → weaker adaptation capacity
- Larger r → more trainable parameters
3. LoRA alpha
- Usually 2x the rank (rank=8 → alpha=16 or 32)
- Controls the scaling factor (alpha / r)
4. Batch size
- As large as GPU memory allows
- 4-16 is typical
- Pair with gradient accumulation
5. Epochs
- 1-3 to avoid overfitting
- Use early stopping
Tuning strategy (see the sketch below):
1. Fix everything else and tune the learning rate first
2. Adjust the batch size, compensating with gradient accumulation
3. Adjust the LoRA rank
4. Watch the loss curve to set the number of epochs
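A minimal sketch of step 2 using the Hugging Face Trainer stack from earlier sections; the effective batch size is per_device_train_batch_size × gradient_accumulation_steps × number of GPUs, and the values below are illustrative:

from transformers import TrainingArguments

# Halving the per-device batch while doubling accumulation keeps the
# effective batch (2 * 8 = 16 per GPU) and thus the optimization dynamics.
args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,  # small enough to fit in memory
    gradient_accumulation_steps=8,  # gradients summed over 8 micro-batches
    learning_rate=2e-4,
)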
Summary
Core Fine-Tuning Takeaways
| Topic | Strategy |
|---|---|
| Data preparation | High quality and diversity; clean and augment |
| Method selection | Prefer LoRA; use full fine-tuning only when required |
| Hyperparameters | Tune LR, rank, and batch size |
| Evaluation | BLEU, ROUGE, perplexity |
| Deployment | Quantize and merge adapters |
Best Practices
- Data is king: high-quality data matters more than a bigger model
- Prefer LoRA: parameter-efficient and fast to train
- Iterate in small steps: validate on a small dataset before scaling up
- Evaluate continuously: monitor loss and metrics throughout training
- Quantize for deployment: cut inference cost