Compare commits
2 Commits
e09a2a5b10
...
e436ec861f
| Author | SHA1 | Date | |
|---|---|---|---|
| | e436ec861f | | |
| | 45efcf0db1 | | |
@@ -108,7 +108,7 @@ DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
|
|||||||
# Note: max_model_len must be > max_input_len to leave room for output tokens
|
# Note: max_model_len must be > max_input_len to leave room for output tokens
|
||||||
# 64k benchmark has inputs up to 65536 tokens, so we need 65536 + 128 = 65664
|
# 64k benchmark has inputs up to 65536 tokens, so we need 65536 + 128 = 65664
|
||||||
DEFAULT_MAX_MODEL_LEN = 65664
|
DEFAULT_MAX_MODEL_LEN = 65664
|
||||||
DEFAULT_MAX_NEW_TOKENS = 128 # Larger for multi-value tasks
|
DEFAULT_MAX_NEW_TOKENS = 16 # Sufficient for NIAH single-value answers
|
||||||
|
|
||||||
# Task categories for evaluation
|
# Task categories for evaluation
|
||||||
NIAH_TASKS = ["niah_single_1", "niah_single_2", "niah_single_3",
|
NIAH_TASKS = ["niah_single_1", "niah_single_2", "niah_single_3",
|
||||||
@@ -323,7 +323,7 @@ def run_ruler_benchmark(
|
|||||||
max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
|
max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
|
||||||
enable_cpu_offload: bool = False,
|
enable_cpu_offload: bool = False,
|
||||||
num_gpu_blocks: int = 4,
|
num_gpu_blocks: int = 4,
|
||||||
block_size: int = 1024,
|
block_size: int = 4096,
|
||||||
num_kv_buffers: int = 4,
|
num_kv_buffers: int = 4,
|
||||||
gpu_utilization: float = 0.9,
|
gpu_utilization: float = 0.9,
|
||||||
enforce_eager: bool = True,
|
enforce_eager: bool = True,
|
||||||
@@ -335,6 +335,7 @@ def run_ruler_benchmark(
|
|||||||
sparse_samples: int = 128,
|
sparse_samples: int = 128,
|
||||||
sparse_block_size: int = 128,
|
sparse_block_size: int = 128,
|
||||||
sparse_stride: int = 8,
|
sparse_stride: int = 8,
|
||||||
|
dtype: Optional[str] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Run RULER benchmark on multiple tasks.
|
Run RULER benchmark on multiple tasks.
|
||||||
@@ -389,6 +390,8 @@ def run_ruler_benchmark(
|
|||||||
"kvcache_block_size": block_size,
|
"kvcache_block_size": block_size,
|
||||||
"enable_cpu_offload": enable_cpu_offload,
|
"enable_cpu_offload": enable_cpu_offload,
|
||||||
}
|
}
|
||||||
|
if dtype:
|
||||||
|
llm_kwargs["dtype"] = dtype
|
||||||
if enable_cpu_offload:
|
if enable_cpu_offload:
|
||||||
llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
|
llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
|
||||||
llm_kwargs["num_kv_buffers"] = num_kv_buffers
|
llm_kwargs["num_kv_buffers"] = num_kv_buffers
|
||||||
@@ -525,8 +528,8 @@ if __name__ == "__main__":
|
|||||||
help="Enable CPU offload mode")
|
help="Enable CPU offload mode")
|
||||||
parser.add_argument("--num-gpu-blocks", type=int, default=4,
|
parser.add_argument("--num-gpu-blocks", type=int, default=4,
|
||||||
help="Number of GPU blocks for CPU offload (default: 4)")
|
help="Number of GPU blocks for CPU offload (default: 4)")
|
||||||
parser.add_argument("--block-size", type=int, default=1024,
|
parser.add_argument("--block-size", type=int, default=4096,
|
||||||
help="KV cache block size (default: 1024)")
|
help="KV cache block size (default: 4096)")
|
||||||
parser.add_argument("--num-kv-buffers", type=int, default=4,
|
parser.add_argument("--num-kv-buffers", type=int, default=4,
|
||||||
help="Number of KV buffers for ring buffer (default: 4)")
|
help="Number of KV buffers for ring buffer (default: 4)")
|
||||||
parser.add_argument("--gpu-utilization", type=float, default=0.9,
|
parser.add_argument("--gpu-utilization", type=float, default=0.9,
|
||||||
@@ -550,6 +553,8 @@ if __name__ == "__main__":
|
|||||||
help="XAttention BSA: block size for estimation")
|
help="XAttention BSA: block size for estimation")
|
||||||
parser.add_argument("--sparse-stride", type=int, default=8,
|
parser.add_argument("--sparse-stride", type=int, default=8,
|
||||||
help="XAttention BSA: stride for Q/K downsampling")
|
help="XAttention BSA: stride for Q/K downsampling")
|
||||||
|
parser.add_argument("--dtype", type=str, default=None,
|
||||||
|
help="Model dtype (bfloat16, float16). Required for models with float32 default.")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -587,6 +592,7 @@ if __name__ == "__main__":
|
|||||||
sparse_samples=args.sparse_samples,
|
sparse_samples=args.sparse_samples,
|
||||||
sparse_block_size=args.sparse_block_size,
|
sparse_block_size=args.sparse_block_size,
|
||||||
sparse_stride=args.sparse_stride,
|
sparse_stride=args.sparse_stride,
|
||||||
|
dtype=args.dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Exit code (skip for json output mode)
|
# Exit code (skip for json output mode)
|
||||||
|
|||||||
Reference in New Issue
Block a user