# Launch llama-server on a single NUMA node with the Step-3.5-Flash Q4_K_S GGUF.
#
# Required environment:
#   SOCKET  NUMA node id handed to numactl for both CPU (-N) and memory (-m)
#           binding. It is not assigned anywhere in this snippet, so the
#           caller must export it.
#
# Flag summary (comments cannot be interleaved with the continued lines below):
#   --ctx-size 65536        context window in tokens
#   -ctk/-ctv q8_0          store the K and V caches as q8_0
#   -ub 4096 -b 4096        micro-batch and logical batch size
#   --parallel 1            a single server slot
#   --threads 96            threads for token generation
#   --threads-batch 128     threads for prompt/batch processing
#   --numa numactl          tell the server that placement is done by numactl
#   --no-mmap               load the weights into memory instead of mmap-ing them
#   --jinja                 use the model's Jinja chat template
#   --validate-quants       check quantized tensor data while loading

# First shard of the split GGUF; the remaining shards are picked up by the loader.
model="/mnt/data/models/stepfun-ai/Step-3.5-Flash-Int4/step3p5_flash_Q4_K_S-00001-of-00012.gguf"

# Fail early with a clear message instead of passing numactl an empty node list.
: "${SOCKET:?set SOCKET to the NUMA node to bind to, e.g. SOCKET=0}"

numactl -N "$SOCKET" -m "$SOCKET" \
./build/bin/llama-server \
    --model "$model" \
    --alias ubergarm/Step-3.5-Flash \
    --ctx-size 65536 \
    -ctk q8_0 -ctv q8_0 \
    -ub 4096 -b 4096 \
    --parallel 1 \
    --threads 96 \
    --threads-batch 128 \
    --numa numactl \
    --host 127.0.0.1 \
    --port 8080 \
    --no-mmap \
    --jinja \
    --validate-quants

INFO [                    main] build info | tid="135355715800704" timestamp=1770404814 build=4183 commit="9a0b5e80"
INFO [                    main] system info | tid="135355715800704" timestamp=1770404814 n_threads=96 n_threads_batch=128 total_threads=512 system_info="AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | "
CPU: using device CPU - 0 MiB free
llama_model_loader: additional 11 GGUFs metadata loaded.
llama_model_loader: loaded meta data with 51 key-value pairs and 754 tensors from /mnt/data/models/stepfun-ai/Step-3.5-Flash-Int4/step3p5_flash_Q4_K_S-00001-of-00012.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = step35
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Hf_Mtp_And_One
llama_model_loader: - kv   3:                         general.size_label str              = 288x7.4B
llama_model_loader: - kv   4:                         step35.block_count u32              = 45
llama_model_loader: - kv   5:                      step35.context_length u32              = 262144
llama_model_loader: - kv   6:                    step35.embedding_length u32              = 4096
llama_model_loader: - kv   7:                 step35.feed_forward_length u32              = 11264
llama_model_loader: - kv   8:                step35.attention.head_count arr[i32,45]      = [64, 96, 96, 96, 64, 96, 96, 96, 64, ...
llama_model_loader: - kv   9:                      step35.rope.freq_base f32              = 5000000.000000
llama_model_loader: - kv  10:                step35.attention.key_length u32              = 128
llama_model_loader: - kv  11:              step35.attention.value_length u32              = 128
llama_model_loader: - kv  12:             step35.attention.head_count_kv arr[i32,45]      = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...
llama_model_loader: - kv  13:            step35.attention.sliding_window u32              = 512
llama_model_loader: - kv  14:    step35.attention.sliding_window_pattern arr[i32,45]      = [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, ...
llama_model_loader: - kv  15:             step35.rope.scaling.apply_mask u32              = 1
llama_model_loader: - kv  16:                        step35.expert_count u32              = 288
llama_model_loader: - kv  17:                   step35.expert_used_count u32              = 8
llama_model_loader: - kv  18:          step35.expert_feed_forward_length u32              = 1280
llama_model_loader: - kv  19:   step35.expert_shared_feed_forward_length u32              = 1280
llama_model_loader: - kv  20:                  step35.expert_gating_func u32              = 2
llama_model_loader: - kv  21:                step35.expert_weights_scale f32              = 3.000000
llama_model_loader: - kv  22:                 step35.expert_weights_norm bool             = true
llama_model_loader: - kv  23:           step35.leading_dense_block_count u32              = 3
llama_model_loader: - kv  24:                  step35.moe_every_n_layers u32              = 1
llama_model_loader: - kv  25:      step35.rope.dimension_count_per_layer arr[i32,45]      = [64, 128, 128, 128, 64, 128, 128, 128...
llama_model_loader: - kv  26:    step35.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  27:            step35.rope.freq_base_per_layer arr[f32,45]      = [5000000.000000, 10000.000000, 10000....
llama_model_loader: - kv  28:                       step35.swiglu_limits arr[f32,45]      = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  29:                step35.swiglu_limits_shared arr[f32,45]      = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = deepseek-v3
llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,128896]  = ["<｜begin▁of▁sentence｜>", "<�...
llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,128896]  = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,127741]  = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
llama_model_loader: - kv  35:                tokenizer.ggml.bos_token_id u32              = 0
llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 128007
llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 1
llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  39:               tokenizer.ggml.add_sep_token bool             = false
llama_model_loader: - kv  40:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {% macro render_content(content) %}{%...
llama_model_loader: - kv  42:               general.quantization_version u32              = 2
llama_model_loader: - kv  43:                          general.file_type u32              = 14
llama_model_loader: - kv  44:                      quantize.imatrix.file str              = ../../../preview_hf_mtp/step3p5_flash...
llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = ../../../wikitext-2-raw/wiki.256.raw
llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 72
llama_model_loader: - kv  48:                                   split.no u16              = 0
llama_model_loader: - kv  49:                        split.tensors.count i32              = 754
llama_model_loader: - kv  50:                                split.count u16              = 12
llama_model_loader: - type  f32:  266 tensors
llama_model_loader: - type q4_K:  476 tensors
llama_model_loader: - type q5_K:   11 tensors
llama_model_loader: - type q6_K:    1 tensors
load: printing all EOG tokens:
load:   - 128007 ('<|im_end|>')
load: special tokens cache size = 818
load: token to piece cache size = 0.8220 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = step35
llm_load_print_meta: n_ctx_train      = 262144
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 45
llm_load_print_meta: n_head           = [64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64, 96, 96, 96, 64]
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 512
llm_load_print_meta: n_swa_pattern    = 1
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = [8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8, 12, 12, 12, 8]
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 11264
llm_load_print_meta: n_expert         = 288
llm_load_print_meta: n_expert_used    = 8
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 2
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 5000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn  = 262144
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: model type       = ?B
llm_load_print_meta: model ftype      = Q4_K - Small
llm_load_print_meta: model params     = 196.956 B
llm_load_print_meta: model size       = 103.837 GiB (4.529 BPW) 
llm_load_print_meta: repeating layers = 103.157 GiB (4.523 BPW, 195.900 B parameters)
llm_load_print_meta: general.name     = Hf_Mtp_And_One
print_info: vocab type       = BPE
print_info: n_vocab          = 128896
print_info: n_merges         = 127741
print_info: BOS token        = 0 '<｜begin▁of▁sentence｜>'
print_info: EOS token        = 128007 '<|im_end|>'
print_info: EOT token        = 128007 '<|im_end|>'
print_info: PAD token        = 1 '<｜end▁of▁sentence｜>'
print_info: LF token         = 201 'Ċ'
print_info: FIM PRE token    = 128801 '<｜fim▁begin｜>'
print_info: FIM SUF token    = 128800 '<｜fim▁hole｜>'
print_info: FIM MID token    = 128802 '<｜fim▁end｜>'
print_info: EOG token        = 128007 '<|im_end|>'
print_info: max token length = 256
llm_load_tensors: ggml ctx size =    0.31 MiB
llm_load_tensors: offloading 0 repeating layers to GPU
llm_load_tensors: offloaded 0/46 layers to GPU
llm_load_tensors:        CPU buffer size = 106328.79 MiB
....................................................................................................
llama_new_context_with_model: n_ctx         = 65536
llama_new_context_with_model: n_batch       = 4096
llama_new_context_with_model: n_ubatch      = 4096
llama_new_context_with_model: flash_attn    = 1
llama_new_context_with_model: attn_max_b    = 0
llama_new_context_with_model: fused_moe     = 1
llama_new_context_with_model: grouped er    = 0
llama_new_context_with_model: fused_up_gate = 1
llama_new_context_with_model: fused_mmad    = 1
llama_new_context_with_model: rope_cache    = 0
llama_new_context_with_model: graph_reuse   = 1
llama_new_context_with_model: k_cache_hadam = 0
llama_new_context_with_model: split_mode_graph_scheduling = 0
llama_new_context_with_model: reduce_type   = f16
llama_new_context_with_model: sched_async   = 0
llama_new_context_with_model: ser           = -1, 0
llama_new_context_with_model: freq_base     = 5000000.0
llama_new_context_with_model: freq_scale    = 1
llama_kv_cache_init:        CPU KV buffer size =  6120.00 MiB
llama_new_context_with_model: KV self size  = 6120.00 MiB, K (q8_0): 3060.00 MiB, V (q8_0): 3060.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.49 MiB
llama_new_context_with_model:        CPU compute buffer size =  2078.00 MiB
llama_new_context_with_model: graph nodes  = 2201
llama_new_context_with_model: graph splits = 1
XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload
===================================== llama_new_context_with_model: f16
======================================= HAVE_FANCY_SIMD is defined
INFO [                    init] initializing slots | tid="135355715800704" timestamp=1770404843 n_slots=1
INFO [                    init] new slot | tid="135355715800704" timestamp=1770404843 id_slot=0 n_ctx_slot=65536
srv          init: Exclude reasoning tokens when selecting slot based on similarity: start: <think>, end: </think>
use `--reasoning-tokens none` to disable.
prompt cache is enabled, size limit: 8192 MiB
use `--cache-ram 0` to disable the prompt cache
INFO [                    main] model loaded | tid="135355715800704" timestamp=1770404843
INFO [                    main] chat template | tid="135355715800704" timestamp=1770404843 chat_template="{% macro render_content(content) %}{% if content is none %}{{- '' }}{% elif content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %}{% endmacro %}\n{{bos_token}}{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- render_content(messages[0].content) + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou have access to the following functions in JSONSchema format:\\n\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson(ensure_ascii=False) }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...>\\n...\\n</function> block must be nested within <tool_call>\\n...\\n</tool_call> XML tags\\n- Required parameters MUST be specified\\n</IMPORTANT><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + render_content(messages[0].content) + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if 
ns.multi_step_tool and message.role == \"user\" and render_content(message.content) is string and not(render_content(message.content).startswith('<tool_response>') and render_content(message.content).endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- set content = render_content(message.content) %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {%- set role_name = 'observation' if (message.role == \"system\" and not loop.first and message.name == 'observation') else message.role %}\n        {{- '<|im_start|>' + role_name + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if enable_thinking %}\n            {%- if message.reasoning_content is string %}\n                {%- set reasoning_content = render_content(message.reasoning_content) %}\n            {%- else %}\n                {%- if '</think>' in content %}\n                    {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                    {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n                {%- endif %}\n            {%- endif %}\n        {%- else %}\n            {# If thinking is disabled, strip any inline <think>...</think> from assistant content #}\n            {%- if '</think>' in content %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n\n        {%- if loop.index0 > ns.last_query_index and enable_thinking %}\n            {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.rstrip('\\n') + '\\n</think>\\n' + content.lstrip('\\n') }}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + 
content.lstrip('\\n') }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if tool_call.function is defined %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n                {%- if tool_call.arguments is defined %}\n                    {%- if tool_call.arguments is mapping %}\n                        {%- set arguments = tool_call.arguments %}\n                        {%- for args_name, args_value in arguments|items %}\n                            {{- '<parameter=' + args_name + '>\\n' }}\n                            {%- set args_value = args_value | tojson(ensure_ascii=False) | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n                            {{- args_value }}\n                            {{- '\\n</parameter>\\n' }}\n                        {%- endfor %}\n                    {%- elif tool_call.arguments is string %}\n                        {# Minja does not support fromjson; preserve raw JSON string as a single parameter #}\n                        {{- '<parameter=arguments>\\n' + tool_call.arguments + '\\n</parameter>\\n' }}\n                    {%- endif %}\n                {%- endif %}\n                {{- '</function>\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>tool_response\\n' }}\n        {%- endif %}\n        {{- '<tool_response>' }}\n        {{- content }}\n        {{- '</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if 
add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking %}\n        {{- '<think>\\n' }}\n    {%- endif %}\n{%- endif %}\n"
INFO [                    main] chat template | tid="135355715800704" timestamp=1770404843 chat_example="<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nHow are you?<|im_end|>\n<|im_start|>assistant\n<think>\n" built_in=true
INFO [                    main] HTTP server listening | tid="135355715800704" timestamp=1770404843 n_threads_http="511" port="8080" hostname="127.0.0.1"
INFO [              slots_idle] all slots are idle | tid="135355715800704" timestamp=1770404843