- Refactor Observer into base class with common enable/disable/reset interface - Create InferenceObserver subclass for TTFT/TPOT metrics - Fix TTFT calculation timing: compute after prefill completes instead of at decode start (fixes max_tokens=1 returning TTFT=0) - Integrate InferenceObserver into bench.py and bench_offload.py for accurate internal timing metrics vs external wall-clock time - Add get_summary() and print_summary() methods for structured output Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
107 lines
3.0 KiB
Python
107 lines
3.0 KiB
Python
"""
|
||
Observer 基类和 InferenceObserver 实现。
|
||
|
||
Observer 架构:
|
||
- Observer: 基类,定义通用接口
|
||
- InferenceObserver: 推理性能观测(TTFT/TPOT)
|
||
- MemoryObserver: 内存传输观测(在 memory_observer.py 中定义)
|
||
"""
|
||
|
||
|
||
class Observer:
    """Base class exposing a shared on/off switch, reset hook and summary output.

    The whole interface is class-level: every method is a ``classmethod`` and
    state lives in class attributes, so observers are used without being
    instantiated.

    Subclasses are expected to provide:

    - ``complete_reset()``: clear all collected statistics.
    - ``get_summary()``: return the statistics as a ``dict``.
    - ``print_summary()``: optionally override to print a human-readable
      summary; the default dumps ``get_summary()`` as indented JSON.
    """

    # Observers start out enabled. Assigning through ``cls`` stores the flag
    # on whichever class the toggle method was invoked on.
    _enabled: bool = True

    @classmethod
    def is_enabled(cls) -> bool:
        """Return the current value of the enabled flag."""
        return cls._enabled

    @classmethod
    def enable(cls) -> None:
        """Turn the observer on."""
        cls._enabled = True

    @classmethod
    def disable(cls) -> None:
        """Turn the observer off."""
        cls._enabled = False

    @classmethod
    def get_summary(cls) -> dict:
        """Return a statistics summary; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    @classmethod
    def complete_reset(cls) -> None:
        """Reset every collected statistic; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

    @classmethod
    def print_summary(cls) -> None:
        """Print ``get_summary()`` to stdout as JSON indented by two spaces.

        Subclasses may override this with a more readable format.
        """
        import json

        summary = cls.get_summary()
        rendered = json.dumps(summary, indent=2)
        print(rendered)
|
||
|
||
|
||
class InferenceObserver(Observer):
    """Inference-performance observer holding TTFT and TPOT measurements.

    - TTFT (Time To First Token): latency until the first token is produced.
    - TPOT (Time Per Output Token): average latency per output token.

    This class only stores timestamps and results; per the original notes the
    values are written from these call sites (line numbers as recorded by the
    author and may drift):

    - TTFT start: scheduler.py:35-36, when the first sequence is taken from
      the waiting queue.
    - TTFT end: llm_engine.py:69-72, after prefill completes (including all
      chunks of a chunked prefill).
    - TPOT start: llm_engine.py:65, at the end of each decode step.
    - TPOT end: llm_engine.py:62-63, computed at the start of the next decode
      step (it measures the previous decode step).
    - Reset: llm_engine.py:97, at the start of generate().

    Note: TPOT needs at least two output tokens to be computed, because it
    measures the interval between decode steps.
    """

    # Start timestamps, in nanoseconds.
    ttft_start: int = 0
    tpot_start: int = 0

    # Measured results, in nanoseconds.
    ttft: int = 0
    tpot: int = 0

    @classmethod
    def reset_ttft(cls) -> None:
        """Zero the TTFT start timestamp; the stored TTFT result is kept."""
        cls.ttft_start = 0

    @classmethod
    def complete_reset(cls) -> None:
        """Zero both start timestamps and both measured results."""
        cls.ttft_start = cls.tpot_start = 0
        cls.ttft = cls.tpot = 0

    @classmethod
    def get_summary(cls) -> dict:
        """Return TTFT and TPOT in nanoseconds and in milliseconds.

        Returns:
            A dict with keys ``ttft_ns``, ``ttft_ms``, ``tpot_ns`` and
            ``tpot_ms``; the ``*_ms`` values are the ``*_ns`` values divided
            by 1e6.
        """
        ttft_ns, tpot_ns = cls.ttft, cls.tpot
        return {
            "ttft_ns": ttft_ns,
            "ttft_ms": ttft_ns / 1e6,
            "tpot_ns": tpot_ns,
            "tpot_ms": tpot_ns / 1e6,
        }

    @classmethod
    def print_summary(cls) -> None:
        """Print TTFT and TPOT in milliseconds with two decimals on one line."""
        ttft_ms = cls.ttft / 1e6
        tpot_ms = cls.tpot / 1e6
        print(f"[InferenceObserver] TTFT: {ttft_ms:.2f}ms, TPOT: {tpot_ms:.2f}ms")
|