♻️ refactor: restructure Observer as base class with InferenceObserver
- Refactor Observer into base class with common enable/disable/reset interface - Create InferenceObserver subclass for TTFT/TPOT metrics - Fix TTFT calculation timing: compute after prefill completes instead of at decode start (fixes max_tokens=1 returning TTFT=0) - Integrate InferenceObserver into bench.py and bench_offload.py for accurate internal timing metrics vs external wall-clock time - Add get_summary() and print_summary() methods for structured output Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -1,17 +1,106 @@
|
||||
class Observer():
|
||||
ttft_start = 0
|
||||
tpot_start = 0
|
||||
"""
|
||||
Observer 基类和 InferenceObserver 实现。
|
||||
|
||||
ttft = 0
|
||||
tpot = 0
|
||||
Observer 架构:
|
||||
- Observer: 基类,定义通用接口
|
||||
- InferenceObserver: 推理性能观测(TTFT/TPOT)
|
||||
- MemoryObserver: 内存传输观测(在 memory_observer.py 中定义)
|
||||
"""
|
||||
|
||||
|
||||
class Observer:
    """Base class for observers: shared enable/disable, reset and output API.

    All state lives on the class and every method is a classmethod, so
    observers are used without instantiation.

    Subclasses are expected to implement:

    - complete_reset(): reset all collected statistics
    - get_summary(): return the statistics as a dict
    - print_summary(): print a human-readable summary (optional override)
    """

    _enabled: bool = True  # enabled by default

    @classmethod
    def enable(cls) -> None:
        """Enable this observer."""
        cls._enabled = True

    @classmethod
    def disable(cls) -> None:
        """Disable this observer."""
        cls._enabled = False

    @classmethod
    def is_enabled(cls) -> bool:
        """Return whether this observer is currently enabled."""
        return cls._enabled

    @classmethod
    def complete_reset(cls) -> None:
        """Reset all collected statistics (implemented by subclasses).

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    @classmethod
    def get_summary(cls) -> dict:
        """Return a summary of the statistics (implemented by subclasses).

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    @classmethod
    def print_summary(cls) -> None:
        """Print a human-readable summary (subclasses may override).

        The default implementation prints ``get_summary()`` as JSON with an
        indent of 2.
        """
        import json

        print(json.dumps(cls.get_summary(), indent=2))
|
||||
|
||||
|
||||
class InferenceObserver(Observer):
    """Inference performance observer that tracks TTFT and TPOT.

    - TTFT (Time To First Token): latency until the first token is generated
    - TPOT (Time Per Output Token): average latency per output token

    Where the measurements are taken:

    - TTFT start: scheduler.py:35-36 - when the first sequence is taken from
      the waiting queue
    - TTFT end: llm_engine.py:69-72 - after prefill completes (including all
      chunks of a chunked prefill)
    - TPOT start: llm_engine.py:65 - at the end of each decode step
    - TPOT end: llm_engine.py:62-63 - computed when the next decode step
      starts (measures the previous decode step)
    - Reset: llm_engine.py:97 - at the start of generate()

    Note: TPOT needs at least 2 output tokens to be computed, because it
    measures the interval between decode steps.
    """

    # Timestamps (nanoseconds)
    ttft_start: int = 0
    tpot_start: int = 0

    # Results (nanoseconds)
    ttft: int = 0
    tpot: int = 0

    @classmethod
    def reset_ttft(cls) -> None:
        """Reset the TTFT timer (only the start timestamp)."""
        cls.ttft_start = 0

    @classmethod
    def complete_reset(cls) -> None:
        """Reset all timestamps and results to zero."""
        cls.ttft_start = 0
        cls.tpot_start = 0
        cls.ttft = 0
        cls.tpot = 0

    @classmethod
    def get_summary(cls) -> dict:
        """Return TTFT and TPOT in both nanoseconds and milliseconds."""
        return {
            "ttft_ns": cls.ttft,
            "ttft_ms": cls.ttft / 1e6,
            "tpot_ns": cls.tpot,
            "tpot_ms": cls.tpot / 1e6,
        }

    @classmethod
    def print_summary(cls) -> None:
        """Print TTFT and TPOT in milliseconds on a single line."""
        print(f"[InferenceObserver] TTFT: {cls.ttft / 1e6:.2f}ms, TPOT: {cls.tpot / 1e6:.2f}ms")
|
||||
|
||||
Reference in New Issue
Block a user