♻️ refactor: restructure Observer as base class with InferenceObserver

- Refactor Observer into base class with common enable/disable/reset interface
- Create InferenceObserver subclass for TTFT/TPOT metrics
- Fix TTFT calculation timing: compute after prefill completes instead of
  at decode start (fixes max_tokens=1 returning TTFT=0)
- Integrate InferenceObserver into bench.py and bench_offload.py for
  accurate internal timing metrics vs external wall-clock time
- Add get_summary() and print_summary() methods for structured output

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-28 03:15:33 +08:00
parent f3e4611e3b
commit c16bfcf40f
5 changed files with 156 additions and 37 deletions

View File

@@ -1,17 +1,106 @@
"""Observer base class and the InferenceObserver implementation.

Observer architecture:
- Observer: base class that defines the shared interface.
- InferenceObserver: inference performance metrics (TTFT/TPOT).
- MemoryObserver: memory-transfer metrics (defined in memory_observer.py).
"""


class Observer:
    """Base class giving every observer the same on/off, reset and report API.

    All state lives on the class and every method is a classmethod, so
    observers are used without being instantiated.

    Subclasses are expected to implement:
    - complete_reset(): reset all collected statistics.
    - get_summary(): return the statistics as a dict.
    - print_summary(): print a human-readable summary (optional override).
    """

    # Enabled by default. The flag is a plain class attribute, so the first
    # enable()/disable() call made on a subclass creates that subclass's own
    # copy; toggling one observer does not toggle the base class or siblings.
    _enabled: bool = True

    @classmethod
    def enable(cls) -> None:
        """Turn this observer on."""
        cls._enabled = True

    @classmethod
    def disable(cls) -> None:
        """Turn this observer off."""
        cls._enabled = False

    @classmethod
    def is_enabled(cls) -> bool:
        """Return True if this observer is currently enabled."""
        return cls._enabled

    @classmethod
    def complete_reset(cls) -> None:
        """Reset all collected statistics.

        Raises:
            NotImplementedError: always, in the base class; subclasses
                provide the implementation.
        """
        raise NotImplementedError

    @classmethod
    def get_summary(cls) -> dict:
        """Return the collected statistics as a dict.

        Raises:
            NotImplementedError: always, in the base class; subclasses
                provide the implementation.
        """
        raise NotImplementedError

    @classmethod
    def print_summary(cls) -> None:
        """Print get_summary() as JSON indented by two spaces.

        Subclasses may override this with a more compact format. Calling it
        on a class that does not implement get_summary() propagates that
        method's NotImplementedError.
        """
        # Imported here so the class does not depend on a module-level import.
        import json

        print(json.dumps(cls.get_summary(), indent=2))
class InferenceObserver(Observer):
    """Observer that holds the inference latency metrics TTFT and TPOT.

    - TTFT (Time To First Token): latency until the first token is generated.
    - TPOT (Time Per Output Token): average latency per output token.

    This class only stores, resets and reports the values; the timestamps
    and results are written by code outside this class. The measurement
    points below are carried over from the original author's notes. The
    file and line references cannot be checked from here and may have
    drifted:

    - TTFT start: scheduler.py:35-36 - when the first sequence is taken
      from the waiting queue.
    - TTFT end: llm_engine.py:69-72 - after prefill completes (including
      all chunks of a chunked prefill).
    - TPOT start: llm_engine.py:65 - at the end of each decode step.
    - TPOT end: llm_engine.py:62-63 - computed at the start of the next
      decode step (measures the previous decode step).
    - Reset: llm_engine.py:97 - at the start of generate().

    Note: TPOT needs at least 2 output tokens to be computed, because it
    measures the interval between decode steps.
    """

    # Start timestamps, in nanoseconds.
    ttft_start: int = 0
    tpot_start: int = 0

    # Measured results, in nanoseconds.
    ttft: int = 0
    tpot: int = 0

    @classmethod
    def reset_ttft(cls) -> None:
        """Reset the TTFT start timestamp; the stored TTFT result is kept."""
        cls.ttft_start = 0

    @classmethod
    def complete_reset(cls) -> None:
        """Reset both start timestamps and both results to zero."""
        cls.ttft_start = 0
        cls.tpot_start = 0
        cls.ttft = 0
        cls.tpot = 0

    @classmethod
    def get_summary(cls) -> dict:
        """Return TTFT and TPOT in nanoseconds and in milliseconds.

        Returns:
            A dict with keys ``ttft_ns``, ``ttft_ms``, ``tpot_ns`` and
            ``tpot_ms``; each ``*_ms`` value is the ``*_ns`` value divided
            by 1e6.
        """
        return {
            "ttft_ns": cls.ttft,
            "ttft_ms": cls.ttft / 1e6,
            "tpot_ns": cls.tpot,
            "tpot_ms": cls.tpot / 1e6,
        }

    @classmethod
    def print_summary(cls) -> None:
        """Print TTFT and TPOT on one line, in milliseconds with 2 decimals."""
        print(f"[InferenceObserver] TTFT: {cls.ttft / 1e6:.2f}ms, TPOT: {cls.tpot / 1e6:.2f}ms")