♻️ refactor: restructure Observer as base class with InferenceObserver
- Refactor Observer into a base class with a common enable/disable/reset interface
- Create InferenceObserver subclass for TTFT/TPOT metrics
- Fix TTFT calculation timing: compute after prefill completes instead of at decode start (fixes max_tokens=1 returning TTFT=0)
- Integrate InferenceObserver into bench.py and bench_offload.py for accurate internal timing metrics vs. external wall-clock time
- Add get_summary() and print_summary() methods for structured output

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
from nanovllm.config import Config
|
||||
from nanovllm.engine.sequence import Sequence, SequenceStatus
|
||||
from nanovllm.utils.observer import Observer
|
||||
from nanovllm.utils.observer import InferenceObserver
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from nanovllm.kvcache import KVCacheManager
|
||||
@@ -32,8 +32,8 @@ class Scheduler:
|
||||
num_seqs = 0
|
||||
num_batched_tokens = 0
|
||||
while self.waiting and num_seqs < self.max_num_seqs:
|
||||
if Observer.ttft_start == 0:
|
||||
Observer.ttft_start = perf_counter_ns()
|
||||
if InferenceObserver.ttft_start == 0:
|
||||
InferenceObserver.ttft_start = perf_counter_ns()
|
||||
seq = self.waiting[0]
|
||||
|
||||
# Check if sequence is too large
|
||||
|
||||
Reference in New Issue
Block a user