Content Outline
Module 6: Evaluation and Regression Testing (Detailed Edition)
Covers: Golden Set, offline evaluation, online A/B testing, LLM-as-Judge
Table of Contents
Must-Know Concepts
6.1 Golden Set
Definition:
A dataset of high-quality, human-annotated question-answer pairs used for evaluation and testing.
Data structure:
{
  "question": "What is concurrent programming in Python?",
  "ground_truth": {
    "answer": "Python offers several approaches to concurrent programming",
    "contexts": ["Python threading documentation"]
  },
  "metadata": {
    "difficulty": "medium",
    "category": "programming",
    "tags": ["concurrency", "python"]
  },
  "retrieved_contexts": ["relevant document 1", "relevant document 2"],
  "generated_answer": "the generated answer"
}
6.2 Offline Evaluation (Offline Eval)
Definition:
Evaluating the system against the Golden Set without affecting production traffic.
Evaluation metrics:
| Metric | Description | Range |
|---|---|---|
| Faithfulness | Whether the answer is grounded in the retrieved context | [0, 1] |
| Answer Relevance | How relevant the answer is to the question | [0, 1] |
| Context Precision | Precision of the retrieved context | [0, 1] |
| Context Recall | Recall of the retrieved context against the ground truth | [0, 1] |
| Correctness | Correctness of the answer against the reference answer | [0, 1] |
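The offline evaluator in 6.2 computes these metrics with RAGAS, which consumes the Golden Set in a tabular format. A minimal conversion sketch (the helper name `to_ragas_dataset` is hypothetical, and the column names follow a ragas 0.1.x-style interface; newer releases may differ):

```python
from datasets import Dataset  # Hugging Face datasets library

def to_ragas_dataset(golden_set: list[dict]) -> Dataset:
    """Map Golden Set items onto the columns a ragas 0.1.x evaluate() call expects."""
    rows = {
        "question": [item["question"] for item in golden_set],
        "answer": [item.get("generated_answer", "") for item in golden_set],
        "contexts": [item.get("retrieved_contexts", []) for item in golden_set],
        "ground_truth": [item["ground_truth"]["answer"] for item in golden_set],
    }
    return Dataset.from_dict(rows)
```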
6.3 Online A/B Testing
Definition:
Comparing different system versions in production based on real user feedback.
Metrics:
- Thumbs-up rate: share of answers users up-vote
- Adoption rate: share of suggestions users accept
- Correction rate: share of answers users edit or correct
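A minimal sketch of aggregating these rates from a feedback event log (the event schema with `variant` and `event_type` fields is an assumption for illustration, not a fixed API):

```python
from collections import Counter

# Hypothetical event records: one dict per answer shown to a user.
events = [
    {"variant": "A", "event_type": "thumbs_up"},
    {"variant": "A", "event_type": "corrected"},
    {"variant": "B", "event_type": "adopted"},
]

def feedback_rates(events: list[dict], variant: str) -> dict:
    """Compute thumbs-up / adoption / correction rates for one variant."""
    subset = [e for e in events if e["variant"] == variant]
    total = len(subset) or 1  # avoid division by zero
    counts = Counter(e["event_type"] for e in subset)
    return {
        "thumbs_up_rate": counts["thumbs_up"] / total,
        "adoption_rate": counts["adopted"] / total,
        "correction_rate": counts["corrected"] / total,
    }

print(feedback_rates(events, "A"))  # {'thumbs_up_rate': 0.5, ...}
```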
6.4 LLM-as-Judge
Definition:
Using an LLM (e.g., GPT-4) to judge the output quality of another LLM.
Key Design Points
6.1 Building the Golden Set
# evaluation/golden_set_builder.py
"""
Golden Set builder
Covers: data collection, annotation, export
"""
from typing import List, Dict, Optional
from dataclasses import dataclass, field
import json


@dataclass
class GroundTruth:
    answer: str
    contexts: List[str]
    references: List[str] = field(default_factory=list)


@dataclass
class QAItem:
    question: str
    ground_truth: GroundTruth
    metadata: Dict = field(default_factory=dict)


class GoldenSetBuilder:
    def __init__(self):
        self._items: List[QAItem] = []

    def add_item(
        self,
        question: str,
        answer: str,
        contexts: Optional[List[str]] = None,
        difficulty: str = "medium",
        category: str = "general",
        tags: Optional[List[str]] = None
    ):
        """Add a single question-answer pair."""
        self._items.append(QAItem(
            question=question,
            ground_truth=GroundTruth(
                answer=answer,
                contexts=contexts or [],
                references=[]
            ),
            metadata={
                "difficulty": difficulty,
                "category": category,
                "tags": tags or []
            }
        ))

    def add_from_dict(self, data: Dict):
        """Add a pair from a plain dict."""
        return self.add_item(
            question=data["question"],
            answer=data["answer"],
            contexts=data.get("contexts"),
            difficulty=data.get("difficulty", "medium"),
            category=data.get("category", "general"),
            tags=data.get("tags")
        )

    def load_from_file(self, file_path: str) -> int:
        """Load items from a JSON file; returns the number of items added."""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        count = 0
        for item in data:
            self.add_from_dict(item)
            count += 1
        return count

    def export_to_file(self, file_path: str):
        """Export all items to a JSON file."""
        with open(file_path, 'w', encoding='utf-8') as f:
            data = [
                {
                    "question": item.question,
                    "ground_truth": {
                        "answer": item.ground_truth.answer,
                        "contexts": item.ground_truth.contexts,
                        "references": item.ground_truth.references
                    },
                    "metadata": item.metadata
                }
                for item in self._items
            ]
            json.dump(data, f, ensure_ascii=False, indent=2)

    def split(self, ratios: Optional[Dict[str, float]] = None) -> Dict[str, List]:
        """Split the dataset according to the given ratios."""
        if ratios is None:
            ratios = {"train": 0.7, "test": 0.3}
        total = len(self._items)
        splits = {}
        start = 0
        for split_name, ratio in ratios.items():
            end = start + int(total * ratio)
            splits[split_name] = self._items[start:end]
            start = end
        return splits


# Usage example
builder = GoldenSetBuilder()

# Add a single item
builder.add_item(
    question="What is Python?",
    answer="Python is a high-level programming language",
    contexts=["Python official documentation"],
    difficulty="easy",
    category="programming",
    tags=["python", "intro"]
)

# Add items in batch
for item in [
    {"question": "What is Java?", "answer": "Java is a programming language", "difficulty": "easy"},
    {"question": "What are microservices?", "answer": "Microservices are an architectural style", "difficulty": "hard"},
]:
    builder.add_from_dict(item)

# Export
builder.export_to_file("golden_set.json")

# Split into train/test
splits = builder.split({"train": 0.7, "test": 0.3})
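The `split` method above slices items in insertion order. When the Golden Set mixes difficulty levels, a stratified split keeps the train/test distributions comparable; a minimal sketch (`stratified_split` is a hypothetical helper, not part of the builder above):

```python
import random
from collections import defaultdict

def stratified_split(items, ratios=None, seed=42):
    """Split QAItem objects while preserving the per-difficulty distribution."""
    ratios = ratios or {"train": 0.7, "test": 0.3}
    by_difficulty = defaultdict(list)
    for item in items:
        by_difficulty[item.metadata.get("difficulty", "medium")].append(item)

    rng = random.Random(seed)
    splits = {name: [] for name in ratios}
    for bucket in by_difficulty.values():
        rng.shuffle(bucket)
        start = 0
        for name, ratio in ratios.items():
            end = start + int(len(bucket) * ratio)
            splits[name].extend(bucket[start:end])
            start = end
    return splits
```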
6.2 Offline Evaluation Framework
# evaluation/offline_evaluator.py
"""
Offline evaluator
Uses the RAGAS framework when available, with a simplified fallback otherwise.
"""
from typing import List, Dict, Optional
from dataclasses import dataclass
import time

try:
    from ragas import evaluate, RunConfig
    # Metric names follow ragas 0.1.x; check the installed version.
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness
    )
    RAGAS_AVAILABLE = True
except ImportError:
    RAGAS_AVAILABLE = False
    print("RAGAS is not available; falling back to simplified evaluation")


@dataclass
class EvaluationResult:
    metric_name: str
    score: float
    details: Optional[Dict] = None


@dataclass
class EvaluationReport:
    overall_metrics: Dict[str, float]
    detailed_metrics: List[EvaluationResult]
    dataset_size: int
    evaluation_time: float


class OfflineEvaluator:
    def __init__(self):
        self.metrics = []
        if RAGAS_AVAILABLE:
            self.metrics = [
                faithfulness,
                answer_relevancy,
                context_precision,
                context_recall,
                answer_correctness
            ]

    def evaluate(
        self,
        dataset,
        raise_exceptions: bool = False
    ) -> EvaluationReport:
        """Evaluate a dataset.

        With RAGAS, `dataset` must already be in the format RAGAS accepts
        (e.g. a datasets.Dataset); the simplified fallback accepts a list of dicts.
        """
        start_time = time.time()
        if RAGAS_AVAILABLE:
            report = self._evaluate_with_ragas(dataset, raise_exceptions)
        else:
            report = self._evaluate_simple(dataset)
        report.evaluation_time = time.time() - start_time
        return report

    def _evaluate_with_ragas(self, dataset, raise_exceptions: bool) -> EvaluationReport:
        """Evaluate with RAGAS."""
        run_config = RunConfig(max_workers=4)
        results = evaluate(
            dataset=dataset,
            metrics=self.metrics,
            run_config=run_config,
            raise_exceptions=raise_exceptions
        )
        # to_pandas() yields one row per sample and one column per metric
        df = results.to_pandas()
        score_columns = [
            c for c in df.columns
            if c not in ("question", "answer", "contexts", "ground_truth")
        ]
        overall = {c: float(df[c].mean()) for c in score_columns}
        return EvaluationReport(
            overall_metrics=overall,
            detailed_metrics=[
                EvaluationResult(metric_name=c, score=overall[c])
                for c in score_columns
            ],
            dataset_size=len(dataset),
            evaluation_time=0.0  # filled in by evaluate()
        )

    def _evaluate_simple(self, dataset: List[Dict]) -> EvaluationReport:
        """Simplified evaluation (used when RAGAS is unavailable)."""
        scores = [self._simple_evaluate(item) for item in dataset]
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return EvaluationReport(
            overall_metrics={"score": avg_score},
            detailed_metrics=[EvaluationResult(
                metric_name="simple_score",
                score=avg_score
            )],
            dataset_size=len(dataset),
            evaluation_time=0.0  # filled in by evaluate()
        )

    def _simple_evaluate(self, item: Dict) -> float:
        """Score one item by comparing the generated answer to the ground truth."""
        generated = item.get("generated_answer", "")
        ground_truth = item.get("ground_truth", {}).get("answer", "")
        return self._string_similarity(ground_truth, generated)

    def _string_similarity(self, str1: str, str2: str) -> float:
        """Character-level Jaccard similarity (a rough proxy, not a real metric)."""
        if not str1 or not str2:
            return 0.0
        set1 = set(str1.lower())
        set2 = set(str2.lower())
        intersection = set1 & set2
        union = set1 | set2
        return len(intersection) / len(union) if union else 0.0


# Usage example
if __name__ == "__main__":
    import json

    # Load the Golden Set
    with open('golden_set.json', 'r', encoding='utf-8') as f:
        golden_set = json.load(f)

    # Simulate generated answers
    for item in golden_set:
        item['generated_answer'] = item['ground_truth']['answer']  # trivial simulation

    # Evaluate
    evaluator = OfflineEvaluator()
    report = evaluator.evaluate(golden_set)

    print("=" * 60)
    print("Offline evaluation report")
    print("=" * 60)
    print(f"Dataset size: {report.dataset_size}")
    print("\nMetric scores:")
    for metric, score in report.overall_metrics.items():
        print(f"  {metric}: {score:.3f}")
6.3 A/B Testing Framework
# evaluation/ab_test_framework.py
"""
A/B testing framework
Covers: traffic allocation, metric collection, statistical analysis
"""
from typing import Any, Callable, Dict, List, Optional
from dataclasses import dataclass
import random
import time


@dataclass
class ABTestConfig:
    variant_a_name: str
    variant_b_name: str
    traffic_ratio: float = 0.5        # share of traffic routed to variant A
    min_samples: int = 100            # minimum number of samples
    duration_seconds: Optional[int] = None


@dataclass
class ABTestResult:
    variant_a_metrics: Dict[str, Any]
    variant_b_metrics: Dict[str, Any]
    total_samples: int
    statistical_significance: Optional[float] = None
    winner: Optional[str] = None


class ABTestFramework:
    def __init__(
        self,
        variant_a: Callable,
        variant_b: Callable,
        config: ABTestConfig
    ):
        self.variant_a = variant_a
        self.variant_b = variant_b
        self.config = config
        self._results_a: List[Dict] = []
        self._results_b: List[Dict] = []

    def run_test(self, test_cases: List[str]) -> ABTestResult:
        """Run the A/B test."""
        print(f"Starting A/B test: {self.config.variant_a_name} vs {self.config.variant_b_name}")
        print(f"Traffic split: {self.config.traffic_ratio * 100}% / {(1 - self.config.traffic_ratio) * 100}%")
        print(f"Minimum samples: {self.config.min_samples}")
        start_time = time.time()
        for i, test_case in enumerate(test_cases[:self.config.min_samples]):
            # Decide which variant handles this case
            if random.random() < self.config.traffic_ratio:
                result = self.variant_a(test_case)
                self._results_a.append(result)
                print(f"Sample {i+1}: Variant A")
            else:
                result = self.variant_b(test_case)
                self._results_b.append(result)
                print(f"Sample {i+1}: Variant B")
            # Stop once the configured duration has elapsed
            if (self.config.duration_seconds and
                    time.time() - start_time > self.config.duration_seconds):
                print("Test duration reached, stopping")
                break
        # Analyze the results
        return self._analyze_results()

    def _analyze_results(self) -> ABTestResult:
        """Analyze collected results."""
        metrics_a = self._extract_metrics(self._results_a)
        metrics_b = self._extract_metrics(self._results_b)
        # Statistical significance (t-test)
        significance = self._calculate_significance()
        # Pick the winner
        winner = self._determine_winner(metrics_a, metrics_b)
        return ABTestResult(
            variant_a_metrics=metrics_a,
            variant_b_metrics=metrics_b,
            total_samples=len(self._results_a) + len(self._results_b),
            statistical_significance=significance,
            winner=winner
        )

    def _extract_metrics(self, results: List[Dict]) -> Dict[str, Any]:
        """Aggregate per-sample results into summary metrics."""
        n = len(results) or 1  # avoid division by zero
        return {
            "avg_score": sum(r.get("score", 0) for r in results) / n,
            "success_rate": sum(1 for r in results if r.get("success")) / n,
            "avg_duration_ms": sum(r.get("duration_ms", 0) for r in results) / n
        }

    def _calculate_significance(self) -> Optional[float]:
        """Statistical significance (simplified two-sample t-test on the score)."""
        from scipy import stats
        data_a = [r.get("score", 0) for r in self._results_a]
        data_b = [r.get("score", 0) for r in self._results_b]
        if len(data_a) < 30 or len(data_b) < 30:
            return None  # not enough samples
        t_stat, p_value = stats.ttest_ind(data_a, data_b)
        return p_value

    def _determine_winner(
        self,
        metrics_a: Dict,
        metrics_b: Dict
    ) -> Optional[str]:
        """Pick the variant with the higher average score."""
        if metrics_a["avg_score"] > metrics_b["avg_score"]:
            return self.config.variant_a_name
        if metrics_b["avg_score"] > metrics_a["avg_score"]:
            return self.config.variant_b_name
        return None
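A minimal usage sketch, assuming each variant is a callable returning a dict with `score`, `success`, and `duration_ms` keys; both variant functions below are placeholders:

```python
import random

def rag_v1(query: str) -> dict:
    # Placeholder for the current production pipeline
    return {"score": random.gauss(0.72, 0.05), "success": True, "duration_ms": 350}

def rag_v2(query: str) -> dict:
    # Placeholder for the candidate pipeline (e.g. with a reranker added)
    return {"score": random.gauss(0.78, 0.05), "success": True, "duration_ms": 420}

config = ABTestConfig(variant_a_name="baseline", variant_b_name="reranker", min_samples=100)
framework = ABTestFramework(variant_a=rag_v1, variant_b=rag_v2, config=config)
result = framework.run_test([f"test query {i}" for i in range(100)])
print(result.winner, result.statistical_significance)
```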
6.4 LLM-as-Judge Framework
# evaluation/llm_as_judge.py
"""
LLM-as-Judge evaluator
Uses one LLM to assess the output of another LLM
"""
from typing import Callable, Dict, List
from dataclasses import dataclass
import json


@dataclass
class EvaluationPrompt:
    system_prompt: str
    evaluation_prompt: str
    score_prompt: str


@dataclass
class LLMJudgement:
    """The judge's verdict"""
    is_accurate: bool
    reasoning: str
    confidence: float


class LLMAsJudge:
    def __init__(self, judge_llm: Callable[[str], str]):
        # judge_llm takes a prompt string and returns the model's raw text response
        self.judge_llm = judge_llm
        self.prompts = EvaluationPrompt(
            system_prompt="You are a professional evaluation assistant responsible for judging the quality of AI-generated answers.",
            evaluation_prompt="""Please evaluate the quality of the following answer.

Question: {question}
Reference answer: {ground_truth}
AI-generated answer: {answer}

Evaluate along these dimensions:
1. Completeness: does it fully answer the question?
2. Accuracy: is the answer correct?
3. Relevance: is the answer on topic?
4. Usability: can the answer be applied in practice?

Score each dimension in [0, 1].
""",
            score_prompt="""Now give an overall verdict.
Consider:
- completeness
- accuracy
- relevance
- readability

Respond strictly in the following JSON format:
{
  "is_accurate": true/false,
  "reasoning": "your rationale",
  "confidence": 0.0-1.0
}"""
        )

    def evaluate(
        self,
        question: str,
        ground_truth: str,
        generated_answer: str
    ) -> LLMJudgement:
        """Evaluate a single answer."""
        prompt = self.prompts.evaluation_prompt.format(
            question=question,
            ground_truth=ground_truth,
            answer=generated_answer
        )
        # Combine the rubric and the JSON output instructions into one prompt
        full_prompt = f"{self.prompts.system_prompt}\n\n{prompt}\n{self.prompts.score_prompt}"
        response = self.judge_llm(full_prompt)
        return self._parse_judgement(response)

    def evaluate_batch(
        self,
        items: List[Dict]
    ) -> List[LLMJudgement]:
        """Evaluate a batch of items."""
        results = []
        for item in items:
            judgement = self.evaluate(
                question=item["question"],
                ground_truth=item.get("ground_truth", ""),
                generated_answer=item.get("generated_answer", "")
            )
            results.append(judgement)
        return results

    def _parse_judgement(self, response: str) -> LLMJudgement:
        """Parse the judge's JSON response."""
        try:
            data = json.loads(response)
            return LLMJudgement(
                is_accurate=data.get("is_accurate", False),
                reasoning=data.get("reasoning", ""),
                confidence=data.get("confidence", 0.5)
            )
        except (json.JSONDecodeError, TypeError):
            # Parsing failed: default to "not accepted"
            return LLMJudgement(
                is_accurate=False,
                reasoning="Could not parse the evaluation response",
                confidence=0.5
            )
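A usage sketch, assuming the judge is an OpenAI-style chat model wrapped in a plain callable (the model name and the wrapper are assumptions; any function mapping a prompt string to a response string works):

```python
from openai import OpenAI  # official openai package, v1-style client

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def gpt4_judge(prompt: str) -> str:
    # Thin wrapper so LLMAsJudge only depends on a prompt -> text callable
    response = client.chat.completions.create(
        model="gpt-4o",  # model choice is an assumption; any capable judge model works
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content

judge = LLMAsJudge(judge_llm=gpt4_judge)
verdict = judge.evaluate(
    question="What is Python?",
    ground_truth="Python is a high-level programming language",
    generated_answer="Python is an interpreted, high-level language",
)
print(verdict.is_accurate, verdict.confidence, verdict.reasoning)
```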
High-Frequency Interview Questions
Q1: How do you build a Golden Set for RAG evaluation?
Model answer:
Golden Set construction steps:

[1. Data collection]
| Source | Method | Quality |
|---|---|---|
| User query logs | Real data | High |
| Support tickets | Human annotation | Highest |
| Doc-generated Q&A | Auto-generated from documents | Medium |
| Community Q&A | StackOverflow, GitHub Issues | Medium |

Practical advice:
- Prefer real user queries
- Start with at least 100-500 pairs as a base
- Cover different difficulty levels (easy/medium/hard)
- Cover different types (technical/non-technical)
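A minimal sketch of turning raw query logs into annotation candidates, covering the dedup and sampling steps above (the JSON-lines log format with a `query` field is an assumption):

```python
import json
import random

def sample_annotation_candidates(log_path: str, n: int = 200, seed: int = 42) -> list:
    """Deduplicate logged queries and sample a batch for annotation."""
    seen = set()
    unique_queries = []
    with open(log_path, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)          # one JSON object per line (assumed)
            query = record["query"].strip()
            key = query.lower()
            if key not in seen:
                seen.add(key)
                unique_queries.append(query)
    rng = random.Random(seed)
    rng.shuffle(unique_queries)
    return unique_queries[:n]
```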
[2. Data schema]
```json
{
  "question": "question text",
  "ground_truth": {
    "answer": "reference answer",
    "contexts": ["related doc ID 1", "related doc ID 2"],
    "references": ["URL 1", "URL 2"]
  },
  "metadata": {
    "difficulty": "easy|medium|hard",
    "category": "category label",
    "tags": ["tag 1", "tag 2"],
    "language": "language"
  }
}
```
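A minimal sketch of validating exported items against this schema before they enter version control (field names follow the schema above; a JSON Schema library could be used instead):

```python
REQUIRED_TOP_LEVEL = {"question", "ground_truth", "metadata"}
REQUIRED_GROUND_TRUTH = {"answer", "contexts"}
ALLOWED_DIFFICULTY = {"easy", "medium", "hard"}

def validate_item(item: dict) -> list:
    """Return a list of problems; an empty list means the item is valid."""
    problems = []
    missing = REQUIRED_TOP_LEVEL - item.keys()
    if missing:
        problems.append(f"missing fields: {sorted(missing)}")
        return problems
    gt_missing = REQUIRED_GROUND_TRUTH - item["ground_truth"].keys()
    if gt_missing:
        problems.append(f"ground_truth missing: {sorted(gt_missing)}")
    if item["metadata"].get("difficulty") not in ALLOWED_DIFFICULTY:
        problems.append("metadata.difficulty must be easy|medium|hard")
    return problems
```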
[3. Annotation guidelines]
- Annotator training
  - Clear scoring criteria
  - Worked examples
  - Cross-review by two annotators
- Difficulty grading
  - easy: basic concepts, a single answer
  - medium: requires reasoning, may have multiple answers
  - hard: complex, multi-step reasoning
- Relevant-document labelling
  - Identify the relevant documents explicitly
  - Provide document IDs
  - Keep source files traceable
- Quality assurance
  - At least 100 pairs
  - Updated regularly
  - Under version management
[4. Engineering automation]
Data collection:
- Export query logs
- Deduplicate and filter
- Random sampling
- Export for annotation
Annotation tooling:
- Use Label Studio
- Assign tasks in batches
- Quality checks
Export and storage:
- Export as JSON
- Version control (git)
- Automatic format validation
[5. Ongoing maintenance]
- Covering new scenarios:
  - Add new question types regularly
  - Update outdated questions
  - Rebalance the difficulty distribution
- Periodic review:
  - Review coverage monthly
  - Check annotation quality
  - Track the distribution across categories
- Version tracking:
  - Manage history with git
  - Keep a changelog
  - Allow rollback to earlier versions
Implementation example:
# Annotation helper
import json
from typing import Dict, Optional


class GoldenSetAnnotator:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def _load_existing(self) -> list:
        """Load existing data."""
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return []

    def annotate_next(self) -> Optional[Dict]:
        """Return the next item awaiting annotation."""
        for item in self._load_existing():
            if not item.get("annotated"):
                return item
        return None

    def save_annotation(self, question_id: str, annotation: Dict):
        """Persist an annotation and mark the item as done."""
        data = self._load_existing()
        for item in data:
            if item["question_id"] == question_id:
                item["annotation"] = annotation
                item["annotated"] = True
                break
        with open(self.file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
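A short usage sketch (the `question_id` field is whatever identifier the raw export carries):

```python
annotator = GoldenSetAnnotator("golden_set_raw.json")
item = annotator.annotate_next()
if item:
    annotator.save_annotation(
        question_id=item["question_id"],
        annotation={"answer": "reference answer written by the annotator", "difficulty": "medium"},
    )
```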
Key points to remember:
- Data sources: real queries > human annotation > auto-generation
- Structure: question/ground_truth/metadata
- Scale: at least 100-500 pairs
- Maintenance: regular updates, version tracking
Key Takeaways
Evaluation system cheat sheet:
Golden Set construction:
- Real queries first
- Multi-dimensional annotation
- Difficulty-graded management
- Version tracking and maintenance
Offline evaluation:
- Automated with RAGAS
- Standardized metrics
- Detailed error analysis
A/B testing:
- Traffic allocation
- Metric collection
- Statistical significance
LLM-as-Judge:
- Complex quality assessment
- Multi-dimensional scoring
- Batch processing
Checklist:
□ Golden Set: 100+ pairs
□ Offline evaluation: RAGAS
□ Online testing: A/B platform
□ Continuous updates: regular maintenance
Minimal Demo
See the full offline evaluator implementation in 6.2.
Hands-On Scenario
Scenario: evaluating a RAG system
Requirement:
Build an evaluation pipeline for a newly developed RAG system.
Architecture:
Evaluation flow:
├─ Build the Golden Set (100+ pairs)
├─ Offline evaluation (RAGAS)
├─ Continuous monitoring (weekly)
└─ Online A/B testing (on version upgrades)
Implementation:
import json
from dataclasses import asdict

# 1. Build the Golden Set
builder = GoldenSetBuilder()

# Add questions
questions = [
    {
        "question": "What is new in Python 3.12?",
        "answer": "Python 3.12 introduces type parameter syntax, performance improvements, and more",
        "contexts": ["py312_features.md"],
        "difficulty": "medium"
    },
    # ... more questions
]
for q in questions:
    builder.add_from_dict(q)
builder.export_to_file("golden_set.json")

# 2. Offline evaluation
evaluator = OfflineEvaluator()
with open("golden_set.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)
report = evaluator.evaluate(dataset)

# 3. Write the report
with open("evaluation_report.json", "w", encoding="utf-8") as f:
    json.dump(asdict(report), f, indent=2)
Monitored metrics:
- Faithfulness
- Answer Relevance
- Context Precision
- Context Recall
- Correctness
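To wire this into regression testing, the evaluation run can be wrapped as a test so that CI fails when a metric falls below an agreed threshold. A minimal pytest-style sketch (the threshold values are assumptions to be set per project, and the import assumes `evaluation/` is an importable package):

```python
# tests/test_rag_regression.py
import json
import pytest

from evaluation.offline_evaluator import OfflineEvaluator

THRESHOLDS = {"faithfulness": 0.80, "answer_relevancy": 0.75}  # example values

@pytest.fixture(scope="module")
def report():
    evaluator = OfflineEvaluator()
    with open("golden_set.json", "r", encoding="utf-8") as f:
        dataset = json.load(f)
    return evaluator.evaluate(dataset)

@pytest.mark.parametrize("metric,threshold", THRESHOLDS.items())
def test_metric_meets_threshold(report, metric, threshold):
    score = report.overall_metrics.get(metric)
    assert score is not None, f"metric {metric} missing from report"
    assert score >= threshold, f"{metric} regressed: {score:.3f} < {threshold:.3f}"
```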
Document version: 1.0