#!/usr/bin/env bash
set -euo pipefail

# Default model artifacts; override with MODEL_GGUF_URL / MODEL_MMPROJ_URL.
DEFAULT_GGUF_URL="https://huggingface.co/lmstudio-community/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
DEFAULT_MMPROJ_URL="https://huggingface.co/lmstudio-community/Qwen3.5-9B-GGUF/resolve/main/mmproj-Qwen3.5-9B-BF16.gguf"

BACKEND_READY_TIMEOUT_SEC=180
# Trailing log lines to dump when startup fails.
RECENT_LOG_LINE_COUNT=80

# Shared helpers; expected to provide download_if_missing and wait_for_backend_ready.
. /usr/local/bin/toolhub-backend-helpers.sh
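# download_if_missing and wait_for_backend_ready are normally supplied by the
# toolhub-backend-helpers.sh file sourced above. The guarded definitions below
# are a minimal sketch of the assumed contract, not the real implementation:
# signatures mirror the call sites in main(), and the curl-based download plus
# polling of llama-server's /health endpoint are assumptions. They take effect
# only if the sourced file does not already define these functions.
if ! declare -F download_if_missing >/dev/null; then
  # download_if_missing <path> <url> <label>: fetch <url> to <path> unless the
  # file already exists; download to a temp file first so a partial download
  # never masquerades as a complete model.
  download_if_missing() {
    local path="$1" url="$2" label="$3"
    if [[ -f "$path" ]]; then
      log_step "$label already present: $path"
      return
    fi
    log_step "downloading $label from $url"
    mkdir -p "$(dirname "$path")"
    curl -fSL --retry 3 -o "${path}.part" "$url"
    mv "${path}.part" "$path"
  }
fi
if ! declare -F wait_for_backend_ready >/dev/null; then
  # wait_for_backend_ready <port> <timeout_sec> <pid> <stdout_log> <stderr_log>:
  # poll the server until it responds, the process dies, or the timeout
  # elapses; on failure, dump the most recent log lines for diagnosis.
  wait_for_backend_ready() {
    local port="$1" timeout_sec="$2" pid="$3" stdout_log="$4" stderr_log="$5"
    local waited=0
    while (( waited < timeout_sec )); do
      if ! kill -0 "$pid" 2>/dev/null; then
        log_step 'llama-server exited before becoming ready'
        break
      fi
      if curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
        return 0
      fi
      sleep 2
      waited=$(( waited + 2 ))
    done
    log_step "backend not ready within ${timeout_sec}s; recent log output:"
    tail -n "$RECENT_LOG_LINE_COUNT" "$stdout_log" "$stderr_log" >&2 || true
    return 1
  }
fi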
log_step() {
  printf '[toolhub-backend] %s\n' "$1"
}

# Alias of log_step, kept so stage banners read distinctly at call sites.
log_stage() {
  log_step "$1"
}

# Locate the llama-server binary: prefer PATH, fall back to /app/llama-server.
resolve_llama_server_bin() {
  local candidate=""
  if candidate="$(command -v llama-server 2>/dev/null)"; then
    printf '%s\n' "$candidate"
    return
  fi
  candidate="/app/llama-server"
  if [[ -x "$candidate" ]]; then
    printf '%s\n' "$candidate"
    return
  fi
  printf 'llama-server not found: the binary is neither on PATH nor at /app/llama-server\n' >&2
  exit 1
}

require_positive_integer() {
  local key="$1"
  local value="$2"
  if [[ ! "$value" =~ ^[0-9]+$ ]] || [[ "$value" -le 0 ]]; then
    printf '%s must be a positive integer, got: %s\n' "$key" "$value" >&2
    exit 1
  fi
}

# Verify a file against an expected SHA256 checksum; skipped when no checksum
# is configured.
verify_sha256() {
  local path="$1"
  local expected="$2"
  if [[ -z "$expected" ]]; then
    return
  fi
  local actual
  actual="$(sha256sum "$path" | awk '{print $1}')"
  if [[ "${actual,,}" != "${expected,,}" ]]; then
    printf 'SHA256 verification failed: %s\n' "$path" >&2
    printf 'expected: %s\n' "$expected" >&2
    printf 'actual:   %s\n' "$actual" >&2
    exit 1
  fi
}

# Map THINK_MODE onto the generation profile. Sets the globals
# REASONING_BUDGET and MAX_TOKENS consumed by main().
resolve_runtime_profile() {
  case "${THINK_MODE:-think-on}" in
    think-on)
      REASONING_BUDGET="-1"
      MAX_TOKENS="-1"
      ;;
    think-off)
      REASONING_BUDGET="0"
      MAX_TOKENS="2048"
      ;;
    *)
      printf 'unsupported THINK_MODE: %s\n' "${THINK_MODE:-}" >&2
      exit 1
      ;;
  esac
}

main() {
  local host_addr="${HOST:-0.0.0.0}"
  local port_num="${PORT:-8081}"
  local model_path="${MODEL_PATH:-/models/model.gguf}"
  local mmproj_path="${MMPROJ_PATH:-/models/mmproj.gguf}"
  local gguf_url="${MODEL_GGUF_URL:-$DEFAULT_GGUF_URL}"
  local mmproj_url="${MODEL_MMPROJ_URL:-$DEFAULT_MMPROJ_URL}"
  local ctx_size="${CTX_SIZE:-16384}"
  local image_min_tokens="${IMAGE_MIN_TOKENS:-256}"
  local image_max_tokens="${IMAGE_MAX_TOKENS:-1024}"
  local mmproj_offload="${MMPROJ_OFFLOAD:-off}"
  local backend_ready_timeout_sec="$BACKEND_READY_TIMEOUT_SEC"
  local llama_server_bin
  local runtime_dir="/tmp/toolhub-backend"
  local stdout_log="${runtime_dir}/llama-server.stdout.log"
  local stderr_log="${runtime_dir}/llama-server.stderr.log"
  local llama_pid

  log_stage 'Stage 1/6: validating runtime parameters'
  require_positive_integer "PORT" "$port_num"
  require_positive_integer "CTX_SIZE" "$ctx_size"
  require_positive_integer "IMAGE_MIN_TOKENS" "$image_min_tokens"
  require_positive_integer "IMAGE_MAX_TOKENS" "$image_max_tokens"
  require_positive_integer "BACKEND_READY_TIMEOUT_SEC" "$backend_ready_timeout_sec"
  if (( image_min_tokens > image_max_tokens )); then
    printf 'IMAGE_MIN_TOKENS must not exceed IMAGE_MAX_TOKENS\n' >&2
    exit 1
  fi
  if [[ "$mmproj_offload" != "on" && "$mmproj_offload" != "off" ]]; then
    printf 'MMPROJ_OFFLOAD only accepts on or off, got: %s\n' "$mmproj_offload" >&2
    exit 1
  fi
  resolve_runtime_profile
  llama_server_bin="$(resolve_llama_server_bin)"

  mkdir -p "$runtime_dir"
  : > "$stdout_log"
  : > "$stderr_log"

  log_stage 'Stage 2/6: checking or downloading the main model'
  download_if_missing "$model_path" "$gguf_url" "main model"

  log_stage 'Stage 3/6: checking or downloading the vision model'
  download_if_missing "$mmproj_path" "$mmproj_url" "vision model"

  log_stage 'Stage 4/6: verifying model files'
  verify_sha256 "$model_path" "${MODEL_GGUF_SHA256:-}"
  verify_sha256 "$mmproj_path" "${MODEL_MMPROJ_SHA256:-}"

  local args=(
    -m "$model_path"
    -mm "$mmproj_path"
    --n-gpu-layers all
    --flash-attn on
    --fit on
    --fit-target 256
    --temp 1.0
    --top-p 0.95
    --top-k 20
    --min-p 0.1
    --presence-penalty 1.5
    --repeat-penalty 1.05
    -n "$MAX_TOKENS"
    --reasoning-budget "$REASONING_BUDGET"
    -c "$ctx_size"
    --image-min-tokens "$image_min_tokens"
    --image-max-tokens "$image_max_tokens"
    --host "$host_addr"
    --port "$port_num"
    --webui
  )
  if [[ "$mmproj_offload" == "off" ]]; then
    args+=(--no-mmproj-offload)
  else
    args+=(--mmproj-offload)
  fi

  log_stage 'Stage 5/6: starting llama-server'
  log_step "launch parameters: host=$host_addr port=$port_num think=${THINK_MODE:-think-on}"
  "$llama_server_bin" "${args[@]}" >"$stdout_log" 2>"$stderr_log" &
  llama_pid=$!
  log_step "llama-server started: PID ${llama_pid}"

  log_stage 'Stage 6/6: waiting for the model to load onto the GPU'
  if ! wait_for_backend_ready "$port_num" "$backend_ready_timeout_sec" "$llama_pid" "$stdout_log" "$stderr_log"; then
    if kill -0 "$llama_pid" 2>/dev/null; then
      kill "$llama_pid" 2>/dev/null || true
      wait "$llama_pid" 2>/dev/null || true
    fi
    exit 1
  fi

  wait "$llama_pid"
}

main "$@"
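# Example invocation, with illustrative values (every variable is optional and
# falls back to the defaults set in main(); the install path of this script is
# deployment-specific):
#   THINK_MODE=think-off PORT=8081 CTX_SIZE=8192 MMPROJ_OFFLOAD=on \
#   MODEL_GGUF_SHA256=<expected checksum> /path/to/this-script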