Observability 可观测性
生产级 AI 应用必须具备完整的可观测性,Spring AI Alibaba 基于 Micrometer 提供开箱即用的指标、追踪和日志能力,深度集成阿里云 ARMS。
为什么 AI 应用需要可观测性?
AI 应用的可观测性比传统应用更复杂:
| 传统应用 | AI 应用 |
|---|---|
| 响应时间 | 响应时间 + Token 生成速度 |
| 错误率 | 错误率 + 模型拒绝率 + 幻觉率 |
| 吞吐量 | 吞吐量 + Token 吞吐量 |
| 资源消耗 | 资源消耗 + Token 消耗 + API 费用 |
| 调用链 | 调用链 + Prompt/Response 内容 |
核心可观测性组件
Spring AI 基于 Micrometer 提供三大可观测性支柱:
┌─────────────────────────────────────────────────────┐
│ Spring AI 可观测性 │
├─────────────────┬───────────────┬───────────────────┤
│ Metrics │ Tracing │ Logging │
│ (指标) │ (追踪) │ (日志) │
│ │ │ │
│ Micrometer │ OpenTelemetry │ SimpleLoggerAdvisor│
│ Prometheus │ Zipkin/Jaeger │ 结构化日志 │
│ Grafana │ ARMS │ │
└─────────────────┴───────────────┴───────────────────┘

依赖配置
xml
<!-- Micrometer + Prometheus -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
<!-- OpenTelemetry 追踪 -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-tracing-bridge-otel</artifactId>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-exporter-otlp</artifactId>
</dependency>

内置 AI 指标
Spring AI 自动采集以下指标(无需额外配置):
ChatModel 指标
yaml
# 启用 AI 指标
management:
endpoints:
web:
exposure:
include: prometheus, health, info
metrics:
tags:
application: ${spring.application.name}

自动采集的指标:
# 请求计数
gen_ai_client_operation_duration_count{
gen_ai_operation_name="chat",
gen_ai_system="dashscope",
gen_ai_request_model="qwen-max",
...
}
# 请求延迟(直方图)
gen_ai_client_operation_duration_bucket{le="0.5"}
gen_ai_client_operation_duration_bucket{le="1.0"}
gen_ai_client_operation_duration_bucket{le="2.0"}
# Token 使用量
gen_ai_client_token_usage_total{
gen_ai_token_type="input",
gen_ai_request_model="qwen-max"
}
gen_ai_client_token_usage_total{
gen_ai_token_type="output",
gen_ai_request_model="qwen-max"
}

自定义业务指标
java
@Service
public class MonitoredChatService {

    @Autowired
    private ChatClient chatClient;

    // Assigned in the constructor — avoids mixing field injection with
    // constructor injection for the same dependency.
    private final MeterRegistry meterRegistry;

    // Meters are created exactly once in the constructor.
    private final Counter chatCounter;   // total chat requests
    private final Counter ragHitCounter; // RAG retrieval hits
    private final Timer chatTimer;       // chat latency with percentiles

    // Micrometer gauges keep only a WEAK reference to their state object.
    // The per-session counts must be held strongly here, otherwise the
    // boxed value is GC'd and the gauge silently starts reporting NaN.
    private final ConcurrentHashMap<String, AtomicInteger> sessionMessageCounts =
            new ConcurrentHashMap<>();

    public MonitoredChatService(MeterRegistry registry) {
        this.meterRegistry = registry;
        this.chatCounter = Counter.builder("ai.chat.requests")
                .description("AI 对话请求总数")
                .tag("service", "chat")
                .register(registry);
        this.ragHitCounter = Counter.builder("ai.rag.hits")
                .description("RAG 检索命中次数")
                .register(registry);
        this.chatTimer = Timer.builder("ai.chat.duration")
                .description("AI 对话响应时间")
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);
    }

    /**
     * Sends one user message through the ChatClient while recording the
     * request count, the call latency and a per-session message gauge.
     *
     * @param message   user input forwarded to the model
     * @param sessionId conversation id used for memory lookup and metric tags
     * @return the assistant's reply content
     */
    public String chat(String message, String sessionId) {
        chatCounter.increment();
        return chatTimer.record(() -> {
            String response = chatClient.prompt()
                    .user(message)
                    .advisors(advisor -> advisor
                            .param(AbstractChatMemoryAdvisor.CHAT_MEMORY_CONVERSATION_ID_KEY, sessionId)
                    )
                    .call()
                    .content();
            // Register the gauge once per session against a strongly-held
            // AtomicInteger, then just update the holder on later calls.
            AtomicInteger holder = sessionMessageCounts.computeIfAbsent(sessionId, id ->
                    meterRegistry.gauge("ai.session.messages",
                            Tags.of("session", id),
                            new AtomicInteger(),
                            AtomicInteger::get));
            holder.set(getMessageCount(sessionId)); // assumes an int count — TODO confirm
            return response;
        });
    }
}
分布式追踪
配置 OpenTelemetry
yaml
spring:
application:
name: ai-service
management:
tracing:
sampling:
probability: 1.0 # 100% 采样(生产环境建议 0.1)
otlp:
tracing:
endpoint: http://otel-collector:4318/v1/traces

Spring AI 自动追踪
Spring AI 自动为每次 AI 调用创建 Span:
Trace: user-request-abc123
│
├── Span: HTTP GET /api/chat (5.2s)
│ │
│ ├── Span: ChatClient.call (4.8s)
│ │ │
│ │ ├── Span: QuestionAnswerAdvisor (0.3s)
│ │ │ └── VectorStore.similaritySearch (0.28s)
│ │ │
│ │ └── Span: DashScopeChatModel.call (4.5s)
│ │ ├── gen_ai.system: dashscope
│ │ ├── gen_ai.request.model: qwen-max
│ │ ├── gen_ai.usage.input_tokens: 523
│ │ └── gen_ai.usage.output_tokens: 287
│ │
│ └── Span: MessageChatMemoryAdvisor.save (0.05s)

自定义 Span
java
@Service
public class TracedAgentService {

    @Autowired
    private Tracer tracer;

    @Autowired
    private ReactAgent agent;

    /**
     * Executes the agent task inside a dedicated span and tags the span
     * with the outcome, so failures are visible in the trace view.
     *
     * @param task   the task description passed to the agent
     * @param userId the requesting user, recorded as a span tag
     * @return the agent's result
     */
    @NewSpan("agent.execute") // creates the span automatically
    public String executeAgent(
            @SpanTag("agent.task") String task,
            @SpanTag("agent.user") String userId
    ) {
        // currentSpan() returns null when tracing is disabled or the request
        // was sampled out — observability must never break the business path.
        Span currentSpan = tracer.currentSpan();
        try {
            String result = agent.run(task);
            if (currentSpan != null) {
                currentSpan.tag("agent.success", "true");
            }
            return result;
        } catch (Exception e) {
            if (currentSpan != null) {
                currentSpan.tag("agent.success", "false");
                // getMessage() may be null; span tag values must be non-null
                currentSpan.tag("agent.error", String.valueOf(e.getMessage()));
                currentSpan.error(e);
            }
            throw e;
        }
    }
}
SimpleLoggerAdvisor:对话日志
java
@Bean
public ChatClient chatClient(ChatClient.Builder builder) {
    // SimpleLoggerAdvisor logs the full prompt and the full model
    // response for every call made through this client.
    SimpleLoggerAdvisor requestResponseLogger = new SimpleLoggerAdvisor();
    return builder
            .defaultAdvisors(requestResponseLogger)
            .build();
}
输出示例:
INFO SimpleLoggerAdvisor - request: AdvisedRequest{
userText='解释 Spring AI 的架构',
systemText='你是一个技术专家',
messages=[...],
advisorParams={...}
}
INFO SimpleLoggerAdvisor - response: ChatResponse{
generations=[Generation{
output=AssistantMessage{content='Spring AI 采用分层架构...'},
metadata=ChatGenerationMetadata{finishReason=STOP}
}],
metadata=ChatResponseMetadata{
usage=DefaultUsage{promptTokens=45, generationTokens=312}
}
}

自定义日志 Advisor
java
@Component
public class AuditLogAdvisor implements CallAroundAdvisor {

    @Autowired
    private AuditLogRepository auditRepo;

    /**
     * Wraps every chat call, measures its duration and persists an audit
     * record asynchronously so the save never adds latency to the response.
     */
    @Override
    public AdvisedResponse aroundCall(
            AdvisedRequest request,
            CallAroundAdvisorChain chain
    ) {
        long startTime = System.currentTimeMillis();
        AdvisedResponse response = chain.nextAroundCall(request);
        long duration = System.currentTimeMillis() - startTime;
        // Persist asynchronously: a failed save must neither block nor fail
        // the chat call — but it must not vanish silently either.
        CompletableFuture.runAsync(() -> {
            AuditLog entry = AuditLog.builder()
                    .sessionId(getSessionId(request))
                    .userMessage(request.userText())
                    .assistantMessage(response.response().getResult().getOutput().getContent())
                    .inputTokens(getInputTokens(response))
                    .outputTokens(getOutputTokens(response))
                    .durationMs(duration)
                    .model(getModel(response))
                    .timestamp(LocalDateTime.now())
                    .build();
            auditRepo.save(entry);
        }).whenComplete((ignored, ex) -> {
            if (ex != null) {
                // runAsync swallows exceptions by default — surface them here
                System.getLogger(AuditLogAdvisor.class.getName())
                        .log(System.Logger.Level.WARNING, "audit log save failed", ex);
            }
        });
        return response;
    }

    @Override
    public int getOrder() {
        return Ordered.LOWEST_PRECEDENCE; // run after all other advisors
    }

    @Override
    public String getName() {
        return "AuditLogAdvisor";
    }
}
集成阿里云 ARMS
ARMS(Application Real-Time Monitoring Service)是阿里云的 APM 服务,与 Spring AI Alibaba 深度集成:
xml
<dependency>
<groupId>com.alibaba.cloud</groupId>
<artifactId>spring-cloud-starter-alibaba-arms</artifactId>
</dependency>

yaml
spring:
cloud:
alibaba:
arms:
app-name: ${spring.application.name}
license-key: ${ARMS_LICENSE_KEY}
# AI 专项监控
ai:
enabled: true
token-cost-alert:
threshold: 10000 # 单次请求超过 10000 Token 告警
latency-alert:
threshold-ms: 5000 # 超过 5 秒告警

ARMS AI 监控大盘提供:
- Token 消耗趋势:按模型、按接口、按用户维度
- 费用预估:实时 Token 消耗换算为费用
- 延迟分布:P50/P95/P99 响应时间
- 错误分析:模型错误、超时、限流统计
- 对话质量:基于用户反馈的满意度追踪
Token 成本监控
java
@Component
public class TokenCostMonitor {

    // Stdlib System.Logger — the original snippet referenced an undeclared `log`.
    private static final System.Logger log =
            System.getLogger(TokenCostMonitor.class.getName());

    // Alert when a single request consumes more than this many tokens.
    private static final long TOKEN_ALERT_THRESHOLD = 8_000L;

    // Unit prices in CNY per 1K tokens (example values only).
    private static final Map<String, Double> INPUT_PRICE = Map.of(
            "qwen-max", 0.04,
            "qwen-plus", 0.008,
            "qwen-turbo", 0.003
    );
    private static final Map<String, Double> OUTPUT_PRICE = Map.of(
            "qwen-max", 0.12,
            "qwen-plus", 0.024,
            "qwen-turbo", 0.006
    );

    @Autowired
    private MeterRegistry meterRegistry;

    /**
     * Records token and cost metrics for every chat response, and warns
     * when a single request exceeds the token threshold.
     */
    @EventListener
    public void onChatResponse(ChatResponseEvent event) {
        ChatResponse response = event.getResponse();
        String model = response.getMetadata().getModel();
        Usage usage = response.getMetadata().getUsage();
        long inputTokens = usage.getPromptTokens();
        long outputTokens = usage.getGenerationTokens();
        // Token counters, tagged by model
        meterRegistry.counter("ai.tokens.input",
                "model", model).increment(inputTokens);
        meterRegistry.counter("ai.tokens.output",
                "model", model).increment(outputTokens);
        // Estimated cost; unknown models fall back to qwen-max pricing
        double cost = inputTokens / 1000.0 * INPUT_PRICE.getOrDefault(model, 0.04)
                + outputTokens / 1000.0 * OUTPUT_PRICE.getOrDefault(model, 0.12);
        meterRegistry.counter("ai.cost.yuan", "model", model).increment(cost);
        long totalTokens = inputTokens + outputTokens;
        if (totalTokens > TOKEN_ALERT_THRESHOLD) {
            // Pre-format with String.format: "{:.4f}" is not a valid logging
            // placeholder and would have been printed literally.
            log.log(System.Logger.Level.WARNING, String.format(
                    "高 Token 消耗告警:model=%s, tokens=%d, cost=¥%.4f",
                    model, totalTokens, cost));
        }
    }
}
Grafana 监控大盘
推荐的 Grafana 面板配置:
json
// 关键监控面板
{
"panels": [
{
"title": "AI 请求 QPS",
"query": "rate(gen_ai_client_operation_duration_count[1m])"
},
{
"title": "P99 响应时间",
"query": "histogram_quantile(0.99, rate(gen_ai_client_operation_duration_bucket[5m]))"
},
{
"title": "Token 消耗速率",
"query": "rate(gen_ai_client_token_usage_total[5m])"
},
{
"title": "错误率",
"query": "rate(gen_ai_client_operation_duration_count{error='true'}[5m]) / rate(gen_ai_client_operation_duration_count[5m])"
}
]
}

健康检查
yaml
management:
health:
ai:
enabled: true # 启用 AI 服务健康检查
endpoint:
health:
show-details: always

json
// GET /actuator/health
{
"status": "UP",
"components": {
"dashScopeChatModel": {
"status": "UP",
"details": {
"model": "qwen-max",
"latency_ms": 245
}
},
"vectorStore": {
"status": "UP",
"details": {
"type": "RedisVectorStore",
"documents": 15420
}
}
}
}

最佳实践
生产监控建议
- 采样率:生产环境追踪采样率设为 5-10%,避免性能影响
- 告警阈值:P99 延迟 > 5s、错误率 > 1%、单次 Token > 8000 触发告警
- 成本预算:设置每日/每月 Token 消耗预算,超限自动降级
- 日志脱敏:对话日志中的敏感信息(手机号、身份证)自动脱敏
- 保留策略:对话日志保留 90 天,指标数据保留 1 年
注意事项
- SimpleLoggerAdvisor 会记录完整 Prompt,生产环境注意日志存储成本
- 高并发场景下,同步写审计日志会影响响应时间,务必使用异步
- Token 指标是成本控制的核心,建议设置每日预算告警