simplify
@@ -66,7 +66,7 @@ class ModelRunner:
                 break

     def read_shm(self):
-        assert self.world_size > 1 and self.rank
+        assert self.world_size > 1 and self.rank > 0
         self.event.wait()
         n = int.from_bytes(self.shm.buf[0:4], "little")
         method_name, *args = pickle.loads(self.shm.buf[4:n+4])
@@ -74,7 +74,7 @@ class ModelRunner:
         return method_name, args

     def write_shm(self, method_name, *args):
-        assert self.world_size > 1 and not self.rank
+        assert self.world_size > 1 and self.rank == 0
         data = pickle.dumps([method_name, *args])
         n = len(data)
         self.shm.buf[0:4] = n.to_bytes(4, "little")
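For context, read_shm and write_shm together implement a small length-prefixed RPC over shared memory: rank 0 writes a pickled [method_name, *args] list behind a 4-byte little-endian length header and signals an event; the other ranks wait on the event and decode. A minimal standalone sketch of that protocol (the SharedMemory/Event setup here is assumed for illustration; the project wires these up elsewhere):

import pickle
from multiprocessing import Event
from multiprocessing.shared_memory import SharedMemory

shm = SharedMemory(create=True, size=2**20)  # assumed 1 MiB buffer
event = Event()

def write(method_name, *args):               # rank 0 side
    data = pickle.dumps([method_name, *args])
    n = len(data)
    shm.buf[0:4] = n.to_bytes(4, "little")   # 4-byte length prefix
    shm.buf[4:n+4] = data                    # pickled payload
    event.set()                              # wake the reader

def read():                                  # rank > 0 side
    event.wait()
    n = int.from_bytes(shm.buf[0:4], "little")
    method_name, *args = pickle.loads(shm.buf[4:n+4])
    event.clear()
    return method_name, args

write("run", 1, 2)
print(read())  # ('run', [1, 2])
shm.close()
shm.unlink()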
@@ -108,7 +108,7 @@ class ModelRunner:
         block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * hf_config.head_dim * hf_config.torch_dtype.itemsize
         config.num_kvcache_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
         assert config.num_kvcache_blocks > 0
-        self.kv_cache = torch.zeros(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, hf_config.head_dim)
+        self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, hf_config.head_dim)
         layer_id = 0
         for module in self.model.modules():
             if hasattr(module, "k_cache") and hasattr(module, "v_cache"):
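To make the sizing concrete, a quick back-of-the-envelope with assumed, illustrative config values (not taken from any particular model):

num_hidden_layers, block_size, num_kv_heads, head_dim = 32, 256, 8, 128
itemsize = 2  # e.g. bfloat16
# the leading factor 2 covers both the K and the V cache
block_bytes = 2 * num_hidden_layers * block_size * num_kv_heads * head_dim * itemsize
print(block_bytes)  # 33554432 bytes = 32 MiB per KV-cache block

The switch from torch.zeros to torch.empty skips initializing this (potentially multi-GiB) allocation; that is safe as long as every cache slot is written before it is read, which appears to be the invariant the slot mapping below maintains.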
@@ -141,7 +141,7 @@ class ModelRunner:
             cu_seqlens_k.append(cu_seqlens_k[-1] + seqlen_k)
             max_seqlen_q = max(seqlen_q, max_seqlen_q)
             max_seqlen_k = max(seqlen_k, max_seqlen_k)
-            if not seq.block_table:
+            if not seq.block_table: # warmup
                 continue
             for i in range(seq.num_cached_blocks, seq.num_blocks):
                 start = seq.block_table[i] * self.block_size
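A tiny sketch of the addressing this loop performs (values assumed): entry i of a sequence's block table names a physical cache block, and start = block_table[i] * block_size is that block's first slot in the flat cache.

block_size = 4
block_table = [7, 2]  # assumed physical block ids for one sequence
for i, block in enumerate(block_table):
    start = block * block_size
    print(i, list(range(start, start + block_size)))
# 0 [28, 29, 30, 31]
# 1 [8, 9, 10, 11]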
@@ -194,12 +194,11 @@ class ModelRunner:
         context = get_context()
         graph = self.graphs[next(x for x in self.graph_bs if x >= bs)]
         graph_vars = self.graph_vars
-        for k, v in graph_vars.items():
-            if k != "outputs":
-                v.zero_()
         graph_vars["input_ids"][:bs] = input_ids
         graph_vars["positions"][:bs] = positions
+        graph_vars["slot_mapping"].fill_(-1)
         graph_vars["slot_mapping"][:bs] = context.slot_mapping
+        graph_vars["context_lens"].zero_()
         graph_vars["context_lens"][:bs] = context.context_lens
         graph_vars["block_tables"][:bs, :context.block_tables.size(1)] = context.block_tables
         graph.replay()
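The new code resets only the buffers whose stale contents could leak across replays (slot_mapping back to -1, context_lens back to 0) instead of zeroing every captured tensor; input_ids, positions, and block_tables are overwritten for the live batch anyway, and entries beyond the live batch are presumably ignored once their context_lens are zero. The underlying capture/replay pattern, in a minimal self-contained sketch (model and shapes assumed; this is the standard PyTorch CUDA graphs recipe, not the project's actual capture code):

import torch

model = torch.nn.Linear(16, 16).cuda()
static_input = torch.zeros(8, 16, device="cuda")  # persistent input buffer

# Warm up on a side stream before capture (required by CUDA graphs).
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)  # output buffer is captured too

# Replay: copy fresh data into the captured buffer, then replay the graph.
static_input.copy_(torch.randn(8, 16, device="cuda"))
graph.replay()
print(static_output.sum().item())  # reflects the new input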