I'm using the GGUF too. It now appears slightly faster in llama.cpp than in the current LM Studio, but it's not clear to me whether that is down to LM Studio having a little more code overhead, an older llama.cpp under the hood, or just parameter differences.
# 27B GGUF
# Benchmark results:
#   - TG speed: ~5 tok/s (vs baseline ~4.5 tok/s, +10% improvement)
#   - PP speed: ~60 tok/s (stable across context sizes)
#   - With parallel=4: total throughput ~28 tok/s
#
# Launches llama-server on the non-MTP 27B Q6_K GGUF under the alias
# "qwen3.6-27b", using ngram-cache speculation and four parallel slots.
#
# NOTE(review): the original listing wrote the user directory as the glob
# `/Users/*`; "${HOME}" is used instead so the path resolves to exactly one
# file. Confirm the models live under the invoking user's home directory.
# NOTE(review): the original listing ended with "..." after
# --reasoning-budget; any further flags were elided there and are not
# reproduced here.
llama-server \
  -m "${HOME}/models/hf/models--unsloth--Qwen3.6-27B-GGUF/snapshots/82d411acf4a06cfb8d9b073a5211bf410bfc29bf/Qwen3.6-27B-Q6_K.gguf" \
  --alias "qwen3.6-27b" \
  -ngl -1 \
  --n-cpu-moe 0 \
  -fa off \
  -ctk q4_0 \
  -ctv f16 \
  -c 131071 \
  -b 512 \
  -ub 256 \
  --spec-type ngram-cache \
  --jinja \
  --cache-ram -1 \
  --parallel 4 \
  --kv-unified \
  --no-context-shift \
  --mlock \
  --slot-save-path ~/qwen_slots \
  --reasoning-budget 512
# 27B MTP GGUF
# Benchmark results (from bench_conversation.sh, ~500tok prompts + multi-turn):
#
#   Config                                 | PP (tok/s) | TG (tok/s) | Draft accept
#   ---------------------------------------|------------|------------|-------------
#   Baseline (ngram-cache, non-MTP)        | 48.0       | 5.2        | N/A
#   MTP --no-mmap, dn=4, fa on, ctk q4_0   | 45.7       | 6.9        | 60%
#   MTP --no-mmap, dn=3, fa on, ctk q4_0   | 46.3       | 7.5        | 73%  <- winner
#
# Launches llama-server on the MTP build of the 27B Q6_K GGUF under the alias
# "qwen3.6-27b-mtp" with draft-mtp speculation and a single slot.
# --spec-draft-n-max 3 presumably corresponds to the "dn=3" winner row above.
#
# NOTE(review): the original listing ended with "..." after
# --reasoning-budget; any further flags were elided there and are not
# reproduced here.
MODEL="/Users/bale/models/hf/models--unsloth--Qwen3.6-27B-MTP-GGUF/snapshots/ac393bc3d23fd5a929a85e2f33c7c4fd5be02d43/Qwen3.6-27B-Q6_K.gguf"

llama-server \
  -m "$MODEL" \
  --alias "qwen3.6-27b-mtp" \
  -ngl -1 \
  --n-cpu-moe 0 \
  --no-mmap \
  -fa on \
  -ctk q4_0 \
  -ctv f16 \
  -c 131071 \
  -b 512 \
  -ub 256 \
  --spec-type draft-mtp \
  --spec-draft-n-max 3 \
  --jinja \
  --cache-ram -1 \
  --parallel 1 \
  --kv-unified \
  --no-context-shift \
  --slot-save-path ~/qwen_slots \
  --reasoning-budget 512
# 35B GGUF
# Benchmark results:
#   - TG speed: ~27 tok/s (vs baseline ~21 tok/s, +30% improvement)
#   - PP speed: ~370-350 tok/s (slight decrease with larger context)
#   - Parallel=4 gives best throughput at ~152 tok/s total
#
# Launches llama-server on the non-MTP 35B-A3B UD-Q6_K_XL GGUF under the alias
# "qwen3.6-35b", using ngram-cache speculation and four parallel slots.
#
# NOTE(review): the original listing wrote the user directory as the glob
# `/Users/*` and split the model path across two lines; the path is rejoined
# and "${HOME}" is used so it resolves to exactly one file. Confirm the models
# live under the invoking user's home directory.
# NOTE(review): the original listing ended with "..." after
# --reasoning-budget; any further flags were elided there and are not
# reproduced here.
llama-server \
  -m "${HOME}/models/hf/models--unsloth--Qwen3.6-35B-A3B-GGUF/snapshots/9280dd353ab587157920d5bd391ada414d84e552/Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf" \
  --alias "qwen3.6-35b" \
  -ngl -1 \
  --n-cpu-moe 0 \
  -fa on \
  -ctk f16 \
  -ctv f16 \
  -c 262144 \
  -b 2048 \
  -ub 512 \
  --spec-type ngram-cache \
  --jinja \
  --cache-ram -1 \
  --parallel 4 \
  --kv-unified \
  --no-context-shift \
  --mlock \
  --threads 4 \
  --threads-batch 8 \
  --slot-save-path ~/qwen_slots \
  --reasoning-budget 512

# 35B MTP GGUF
# Benchmark results (128K context, verified 2026-05-18):
#   TG: 30.7 tok/s, Draft accept: 65%, no OOM at 128K
# Launches llama-server on the MTP build of the 35B-A3B UD-Q6_K_XL GGUF under
# the alias "qwen3.6-35b-mtp" with draft-mtp speculation and a single slot.
#
# NOTE(review): the original assignment used "/Users/**/..." inside double
# quotes, where the glob is never expanded, so the path could not exist.
# "${HOME}" is used instead; confirm the models live under the invoking
# user's home directory.
# NOTE(review): -c 262144 is 256K tokens, while the benchmark header for this
# configuration cites a 128K context — confirm the intended context size.
# NOTE(review): the original listing ended with "..." after
# --reasoning-budget; any further flags were elided there and are not
# reproduced here.
MODEL="${HOME}/models/hf/models--unsloth--Qwen3.6-35B-A3B-MTP-GGUF/snapshots/e28512781649329c5b37cbf55029355a48d158d4/Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf"

# Exported so the llama-server process started below inherits it.
export GGML_METAL_BF16_DISABLE=1

llama-server \
  -m "$MODEL" \
  --alias "qwen3.6-35b-mtp" \
  -ngl -1 \
  --n-cpu-moe 0 \
  --no-mmap \
  -fa on \
  -ctk f16 \
  -ctv f16 \
  -c 262144 \
  -b 512 \
  -ub 256 \
  --spec-type draft-mtp \
  --spec-draft-n-max 3 \
  --jinja \
  --cache-ram -1 \
  --parallel 1 \
  --no-context-shift \
  --threads 4 \
  --threads-batch 8 \
  --slot-save-path ~/qwen_slots \
  --reasoning-budget 512