1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
| docker run --gpus all -p 8080:8080 -v /mnt/ExtDisk/AI_Models:/models \ ghcr.io/ggml-org/llama.cpp:full-cuda \ --server \ --model /models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf \ --host 0.0.0.0 \ --port 8080 \ --n-gpu-layers 12 \ --flash-attn on \ --ctx-size 4096 ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA GeForce RTX 3060 Laptop GPU, compute capability 8.6, VMM: yes load_backend: loaded CUDA backend from /app/libggml-cuda.so load_backend: loaded CPU backend from /app/libggml-cpu-alderlake.so main: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true build: 7964 (b83111815) with GNU 11.4.0 for Linux x86_64 system info: n_threads = 6, n_threads_batch = 6, total_threads = 20
system_info: n_threads = 6 (n_threads_batch = 6) / 20 | CUDA : ARCHS = 500,610,700,750,800,860,890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
Running without SSL init: using 19 threads for HTTP server start: binding port with default address family main: loading model srv load_model: loading model '/models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf' common_init_result: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on llama_params_fit_impl: projected to use 4716 MiB of device memory vs. 5562 MiB of free device memory llama_params_fit_impl: cannot meet free memory target of 1024 MiB, need to reduce device memory by 177 MiB llama_params_fit_impl: context size set by user to 4096 -> no change llama_params_fit: failed to fit params to free device memory: n_gpu_layers already set by user to 12, abort llama_params_fit: fitting params to free memory took 0.26 seconds llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3060 Laptop GPU) (0000:01:00.0) - 5562 MiB free llama_model_loader: loaded meta data with 45 key-value pairs and 579 tensors from /models/Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = qwen3moe llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B-Instruct-2507 llama_model_loader: - kv 3: general.version str = 2507 llama_model_loader: - kv 4: general.finetune str = Instruct llama_model_loader: - kv 5: general.basename str = Qwen3-30B-A3B-Instruct-2507 llama_model_loader: - kv 6: general.quantized_by str = Unsloth llama_model_loader: - kv 7: general.size_label str = 30B-A3B llama_model_loader: - kv 8: general.license str = apache-2.0 llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B... llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth llama_model_loader: - kv 11: general.base_model.count u32 = 1 llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507 llama_model_loader: - kv 13: general.base_model.0.version str = 2507 llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B... llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] llama_model_loader: - kv 17: qwen3moe.block_count u32 = 48 llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 2048 llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 6144 llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 32 llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 10000000.000000 llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 768 llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... llama_model_loader: - kv 39: general.quantization_version u32 = 2 llama_model_loader: - kv 40: general.file_type u32 = 15 llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-30B-A3B-Instruct-2507-GGUF/imat... llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-30B-A3B-Ins... llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 384 llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 llama_model_loader: - type f32: 241 tensors llama_model_loader: - type q4_K: 289 tensors llama_model_loader: - type q6_K: 49 tensors print_info: file format = GGUF V3 (latest) print_info: file type = Q4_K - Medium print_info: file size = 17.28 GiB (4.86 BPW) load: 0 unused tokens load: printing all EOG tokens: load: - 151643 ('<|endoftext|>') load: - 151645 ('<|im_end|>') load: - 151662 ('<|fim_pad|>') load: - 151663 ('<|repo_name|>') load: - 151664 ('<|file_sep|>') load: special tokens cache size = 26 load: token to piece cache size = 0.9311 MB print_info: arch = qwen3moe print_info: vocab_only = 0 print_info: no_alloc = 0 print_info: n_ctx_train = 262144 print_info: n_embd = 2048 print_info: n_embd_inp = 2048 print_info: n_layer = 48 print_info: n_head = 32 print_info: n_head_kv = 4 print_info: n_rot = 128 print_info: n_swa = 0 print_info: is_swa_any = 0 print_info: n_embd_head_k = 128 print_info: n_embd_head_v = 128 print_info: n_gqa = 8 print_info: n_embd_k_gqa = 512 print_info: n_embd_v_gqa = 512 print_info: f_norm_eps = 0.0e+00 print_info: f_norm_rms_eps = 1.0e-06 print_info: f_clamp_kqv = 0.0e+00 print_info: f_max_alibi_bias = 0.0e+00 print_info: f_logit_scale = 0.0e+00 print_info: f_attn_scale = 0.0e+00 print_info: n_ff = 6144 print_info: n_expert = 128 print_info: n_expert_used = 8 print_info: n_expert_groups = 0 print_info: n_group_used = 0 print_info: causal attn = 1 print_info: pooling type = 0 print_info: rope type = 2 print_info: rope scaling = linear print_info: freq_base_train = 10000000.0 print_info: freq_scale_train = 1 print_info: n_ctx_orig_yarn = 262144 print_info: rope_yarn_log_mul = 0.0000 print_info: rope_finetuned = unknown print_info: model type = 30B.A3B print_info: model params = 30.53 B print_info: general.name = Qwen3-30B-A3B-Instruct-2507 print_info: n_ff_exp = 768 print_info: vocab type = BPE print_info: n_vocab = 151936 print_info: n_merges = 151387 print_info: BOS token = 11 ',' print_info: EOS token = 151645 '<|im_end|>' print_info: EOT token = 151645 '<|im_end|>' print_info: PAD token = 151654 '<|vision_pad|>' print_info: LF token = 198 'Ċ' print_info: FIM PRE token = 151659 '<|fim_prefix|>' print_info: FIM SUF token = 151661 '<|fim_suffix|>' print_info: FIM MID token = 151660 '<|fim_middle|>' print_info: FIM PAD token = 151662 '<|fim_pad|>' print_info: FIM REP token = 151663 '<|repo_name|>' print_info: FIM SEP token = 151664 '<|file_sep|>' print_info: EOG token = 151643 '<|endoftext|>' print_info: EOG token = 151645 '<|im_end|>' print_info: EOG token = 151662 '<|fim_pad|>' print_info: EOG token = 151663 '<|repo_name|>' print_info: EOG token = 151664 '<|file_sep|>' print_info: max token length = 256 load_tensors: loading model tensors, this can take a while... (mmap = true, direct_io = false) load_tensors: offloading output layer to GPU load_tensors: offloading 11 repeating layers to GPU load_tensors: offloaded 12/49 layers to GPU load_tensors: CPU_Mapped model buffer size = 13363.29 MiB load_tensors: CUDA0 model buffer size = 4328.06 MiB .................................................................................................... common_init_result: added <|endoftext|> logit bias = -inf common_init_result: added <|im_end|> logit bias = -inf common_init_result: added <|fim_pad|> logit bias = -inf common_init_result: added <|repo_name|> logit bias = -inf common_init_result: added <|file_sep|> logit bias = -inf llama_context: constructing llama_context llama_context: n_seq_max = 4 llama_context: n_ctx = 4096 llama_context: n_ctx_seq = 4096 llama_context: n_batch = 2048 llama_context: n_ubatch = 512 llama_context: causal_attn = 1 llama_context: flash_attn = enabled llama_context: kv_unified = true llama_context: freq_base = 10000000.0 llama_context: freq_scale = 1 llama_context: n_ctx_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized llama_context: CUDA_Host output buffer size = 2.32 MiB llama_kv_cache: CPU KV buffer size = 296.00 MiB llama_kv_cache: CUDA0 KV buffer size = 88.00 MiB llama_kv_cache: size = 384.00 MiB ( 4096 cells, 48 layers, 4/1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB sched_reserve: reserving ... sched_reserve: CUDA0 compute buffer size = 300.75 MiB sched_reserve: CUDA_Host compute buffer size = 16.01 MiB sched_reserve: graph nodes = 3031 sched_reserve: graph splits = 520 (with bs=512), 76 (with bs=1) sched_reserve: reserve took 12.28 ms, sched copies = 1 common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) srv load_model: initializing slots, n_slots = 4 no implementations specified for speculative decoding slot load_model: id 0 | task -1 | speculative decoding context not initialized slot load_model: id 0 | task -1 | new slot, n_ctx = 4096 no implementations specified for speculative decoding slot load_model: id 1 | task -1 | speculative decoding context not initialized slot load_model: id 1 | task -1 | new slot, n_ctx = 4096 no implementations specified for speculative decoding slot load_model: id 2 | task -1 | speculative decoding context not initialized slot load_model: id 2 | task -1 | new slot, n_ctx = 4096 no implementations specified for speculative decoding slot load_model: id 3 | task -1 | speculative decoding context not initialized slot load_model: id 3 | task -1 | new slot, n_ctx = 4096 srv load_model: prompt cache is enabled, size limit: 8192 MiB srv load_model: use `--cache-ram 0` to disable the prompt cache srv load_model: for more info see https://github.com/ggml-org/llama.cpp/pull/16391 init: chat template, example_format: '<|im_start|>system You are a helpful assistant<|im_end|> <|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there<|im_end|> <|im_start|>user How are you?<|im_end|> <|im_start|>assistant ' srv init: init: chat template, thinking = 0 main: model loaded main: server is listening on http://0.0.0.0:8080 main: starting the main loop... srv update_slots: all slots are idle
|