📈 feat: add MemoryObserver for GPU-CPU communication tracking

Implement MemoryObserver to track memory transfers between GPU and CPU, as well as on-GPU buffer copies:
- H2D (Host to Device): CPU → GPU transfers
- D2H (Device to Host): GPU → CPU transfers
- D2D (Device to Device): GPU buffer copies
- Supports prefill/decode phase separation (H2D and D2H records take an is_prefill flag)

Integration points in offload_engine.py:
- load_to_slot_layer: H2D with is_prefill parameter
- offload_slot_layer_to_cpu, offload_prefill_buffer_async: D2H
- write_to_prefill_buffer, write_to_decode_buffer: D2D
- load_block_sample_from_cpu, load_block_full_from_cpu: H2D

Add bench_offload.py integration for memory stats printing.

Benchmark results (Llama-3.1-8B, 64K context):
- Full Policy: Prefill H2D 262.13 GB
- XAttention: Prefill H2D 386.62 GB (1.47x the Full Policy volume)

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-28 04:06:45 +08:00
parent c16bfcf40f
commit 39d12a0416
8 changed files with 458 additions and 3 deletions

View File

@@ -17,6 +17,7 @@ from dataclasses import dataclass
from nanovllm.kvcache.kernels import gathered_copy_kv
from nanovllm.comm import memcpy_2d_async
from nanovllm.utils.logger import get_logger
from nanovllm.utils.memory_observer import MemoryObserver
# Import for type hints only (avoid circular import)
from typing import TYPE_CHECKING
@@ -376,7 +377,8 @@ class OffloadEngine:
self.ring_slot_compute_done[slot_idx].record()
def load_to_slot_layer(
self, slot_idx: int, layer_id: int, cpu_block_id: int, chunk_idx: int = -1
self, slot_idx: int, layer_id: int, cpu_block_id: int, chunk_idx: int = -1,
is_prefill: bool = True,
) -> None:
"""
Async load a single CPU block to a ring buffer slot for one layer.
@@ -393,6 +395,7 @@ class OffloadEngine:
layer_id: Layer index to load (for CPU cache indexing)
cpu_block_id: Source CPU block ID
chunk_idx: Optional chunk index for NVTX labeling (-1 means not specified)
is_prefill: True if in prefill phase, False if in decode phase (for MemoryObserver)
"""
logger.debug(f"Ring load: layer={layer_id}, CPU[{cpu_block_id}] -> GPU slot[{slot_idx}]")
@@ -425,6 +428,9 @@ class OffloadEngine:
self.ring_slot_ready[slot_idx].record(stream)
nvtx.pop_range()
# Record H2D transfer: K + V = 2 * block_bytes
MemoryObserver.record_h2d(2 * self.gpu_block_bytes, is_prefill=is_prefill)
def wait_slot_layer(self, slot_idx: int) -> None:
"""
Wait for a slot's loading to complete.
@@ -499,6 +505,9 @@ class OffloadEngine:
self.ring_slot_offload_done[slot_idx].record(self.transfer_stream_main)
nvtx.pop_range()
# Record D2H transfer: K + V = 2 * block_bytes
MemoryObserver.record_d2h(2 * self.gpu_block_bytes, is_prefill=is_prefill)
# ----- KV access methods for ring buffer -----
def get_kv_for_slot(self, slot_idx: int) -> Tuple[Tensor, Tensor]:
@@ -745,6 +754,10 @@ class OffloadEngine:
self.prefill_v_buffer[layer_id, :num_tokens].copy_(v)
torch.cuda.nvtx.range_pop()
# Record D2D transfer: K + V
transfer_bytes = 2 * k.numel() * k.element_size()
MemoryObserver.record_d2d(transfer_bytes)
def write_to_decode_buffer(
self,
layer_id: int,
@@ -768,6 +781,10 @@ class OffloadEngine:
self.decode_v_buffer[layer_id, pos_in_block].copy_(v)
torch.cuda.nvtx.range_pop()
# Record D2D transfer: K + V (single token)
transfer_bytes = 2 * k.numel() * k.element_size()
MemoryObserver.record_d2d(transfer_bytes)
def offload_prefill_buffer_async(
self,
layer_id: int,
@@ -813,6 +830,9 @@ class OffloadEngine:
self.prefill_offload_events[layer_id].record(stream)
nvtx.pop_range()
# Record D2H transfer: K + V = 2 * block_bytes
MemoryObserver.record_d2h(2 * self.gpu_block_bytes, is_prefill=True)
def wait_all_prefill_offloads(self) -> None:
"""Wait for all prefill buffer offloads to complete."""
for stream in self.prefill_offload_streams:
@@ -851,6 +871,11 @@ class OffloadEngine:
v_sample = self.v_cache_cpu[
layer_id, cpu_block_id, :num_samples
].clone().cuda()
# Record H2D transfer: K + V samples
transfer_bytes = 2 * k_sample.numel() * k_sample.element_size()
MemoryObserver.record_h2d(transfer_bytes, is_prefill=True)
return k_sample, v_sample
def load_block_full_from_cpu(
@@ -877,4 +902,8 @@ class OffloadEngine:
v_full = self.v_cache_cpu[
layer_id, cpu_block_id
].clone().cuda()
# Record H2D transfer: K + V full block
MemoryObserver.record_h2d(2 * self.gpu_block_bytes, is_prefill=True)
return k_full, v_full

View File

@@ -422,7 +422,7 @@ class FullAttentionPolicy(SparsePolicy):
num_preload = min(num_slots, num_blocks)
for i in range(num_preload):
cpu_block_id = cpu_block_table[i]
offload_engine.load_to_slot_layer(load_slots[i], layer_id, cpu_block_id, chunk_idx=cpu_block_id)
offload_engine.load_to_slot_layer(load_slots[i], layer_id, cpu_block_id, chunk_idx=cpu_block_id, is_prefill=False)
# Phase 2: Process blocks with pipeline
for block_idx in range(num_blocks):
@@ -456,7 +456,7 @@ class FullAttentionPolicy(SparsePolicy):
next_block_idx = block_idx + num_slots
if next_block_idx < num_blocks:
next_cpu_block_id = cpu_block_table[next_block_idx]
offload_engine.load_to_slot_layer(current_slot, layer_id, next_cpu_block_id, chunk_idx=next_cpu_block_id)
offload_engine.load_to_slot_layer(current_slot, layer_id, next_cpu_block_id, chunk_idx=next_cpu_block_id, is_prefill=False)
# Merge with accumulated
with torch.cuda.stream(compute_stream):