first
This commit is contained in:
15
docker/backend/Dockerfile
Normal file
15
docker/backend/Dockerfile
Normal file
@@ -0,0 +1,15 @@
|
||||
# Backend image: llama.cpp CUDA server wrapped by the ToolHub entrypoint.
FROM ghcr.io/ggml-org/llama.cpp:server-cuda

USER root
WORKDIR /app

# curl + CA certs are needed by the entrypoint for model downloads and
# for the localhost /health readiness probe.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*

COPY docker/backend/entrypoint.sh /usr/local/bin/toolhub-backend-entrypoint.sh
COPY docker/backend/entrypoint_helpers.sh /usr/local/bin/toolhub-backend-helpers.sh

RUN chmod +x /usr/local/bin/toolhub-backend-entrypoint.sh /usr/local/bin/toolhub-backend-helpers.sh

ENTRYPOINT ["/usr/local/bin/toolhub-backend-entrypoint.sh"]
|
||||
176
docker/backend/entrypoint.sh
Normal file
176
docker/backend/entrypoint.sh
Normal file
@@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env bash
# ToolHub backend entrypoint: validates configuration, downloads the GGUF
# model files when missing, launches llama-server, and blocks until it is
# healthy (then for its lifetime).
set -euo pipefail

# Defaults used when MODEL_GGUF_URL / MODEL_MMPROJ_URL are not provided.
DEFAULT_GGUF_URL="https://huggingface.co/lmstudio-community/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
DEFAULT_MMPROJ_URL="https://huggingface.co/lmstudio-community/Qwen3.5-9B-GGUF/resolve/main/mmproj-Qwen3.5-9B-BF16.gguf"
# Seconds to wait for the /health endpoint before giving up on startup.
BACKEND_READY_TIMEOUT_SEC=180
# Tail length used by the sourced helpers when dumping logs after a failure.
RECENT_LOG_LINE_COUNT=80

# Helper functions: download, readiness polling, log dumping.
. /usr/local/bin/toolhub-backend-helpers.sh
|
||||
|
||||
# Emit one namespaced log line to stdout.
log_step() {
  local message="$1"
  printf '[toolhub-backend] %s\n' "$message"
}
|
||||
|
||||
# Announce a stage transition. Currently identical to log_step; kept as a
# separate name so stage markers can be formatted differently later.
log_stage() {
  log_step "$1"
}
|
||||
|
||||
# Locate the llama-server executable: PATH takes precedence, with
# /app/llama-server as the fallback shipped by the base image.
# Prints the resolved path on stdout; exits the script when neither exists.
resolve_llama_server_bin() {
  local found
  if found="$(command -v llama-server 2>/dev/null)"; then
    printf '%s\n' "$found"
    return
  fi

  local fallback="/app/llama-server"
  if [[ -x "$fallback" ]]; then
    printf '%s\n' "$fallback"
    return
  fi

  printf '未找到 llama-server,可执行文件既不在 PATH 中,也不在 /app/llama-server\n' >&2
  exit 1
}
|
||||
|
||||
# Abort unless $2 is a strictly positive decimal integer.
# $1 is the setting name, used only in the error message.
require_positive_integer() {
  local key="$1"
  local value="$2"
  if [[ "$value" =~ ^[0-9]+$ ]] && (( value > 0 )); then
    return
  fi
  printf '%s 必须是正整数,收到: %s\n' "$key" "$value" >&2
  exit 1
}
|
||||
|
||||
# Verify a file's SHA-256 digest against an expected value, case-insensitive.
# An empty expected value means "no check requested" and passes silently.
# Exits the script on mismatch, printing expected vs. actual.
verify_sha256() {
  local path="$1"
  local expected="$2"
  [[ -n "$expected" ]] || return 0

  local actual
  actual="$(sha256sum "$path" | awk '{print $1}')"
  if [[ "${actual,,}" == "${expected,,}" ]]; then
    return 0
  fi
  printf 'SHA256 校验失败: %s\n' "$path" >&2
  printf '期望: %s\n' "$expected" >&2
  printf '实际: %s\n' "$actual" >&2
  exit 1
}
|
||||
|
||||
# Translate THINK_MODE (default think-on) into the llama-server knobs
# REASONING_BUDGET and MAX_TOKENS. Both are set as globals read by main.
resolve_runtime_profile() {
  local mode="${THINK_MODE:-think-on}"
  case "$mode" in
    think-on)
      REASONING_BUDGET="-1"   # -1 = unlimited reasoning budget
      MAX_TOKENS="-1"         # -1 = unlimited generation length
      ;;
    think-off)
      REASONING_BUDGET="0"
      MAX_TOKENS="2048"
      ;;
    *)
      printf '不支持的 THINK_MODE: %s\n' "${THINK_MODE:-}" >&2
      exit 1
      ;;
  esac
}
|
||||
|
||||
#######################################
# Orchestrate the backend boot in six stages: validate parameters, fetch
# both model files, checksum them, start llama-server in the background,
# then block until it is healthy and for its lifetime.
# Globals:  reads HOST/PORT/MODEL_*/CTX_SIZE/IMAGE_*/MMPROJ_OFFLOAD/
#           THINK_MODE env overrides and BACKEND_READY_TIMEOUT_SEC;
#           REASONING_BUDGET and MAX_TOKENS are set by resolve_runtime_profile.
# Outputs:  stage/progress logs on stdout; llama-server output is captured
#           into /tmp/toolhub-backend/*.log for post-mortem dumps.
# Returns:  llama-server's exit status; exits 1 on any validation failure.
#######################################
main() {
  # Runtime configuration, all overridable via environment variables.
  local host_addr="${HOST:-0.0.0.0}"
  local port_num="${PORT:-8081}"
  local model_path="${MODEL_PATH:-/models/model.gguf}"
  local mmproj_path="${MMPROJ_PATH:-/models/mmproj.gguf}"
  local gguf_url="${MODEL_GGUF_URL:-$DEFAULT_GGUF_URL}"
  local mmproj_url="${MODEL_MMPROJ_URL:-$DEFAULT_MMPROJ_URL}"
  local ctx_size="${CTX_SIZE:-16384}"
  local image_min_tokens="${IMAGE_MIN_TOKENS:-256}"
  local image_max_tokens="${IMAGE_MAX_TOKENS:-1024}"
  local mmproj_offload="${MMPROJ_OFFLOAD:-off}"
  local backend_ready_timeout_sec="$BACKEND_READY_TIMEOUT_SEC"
  local llama_server_bin
  local runtime_dir="/tmp/toolhub-backend"
  local stdout_log="${runtime_dir}/llama-server.stdout.log"
  local stderr_log="${runtime_dir}/llama-server.stderr.log"
  local llama_pid

  log_stage '阶段 1/6: 检查运行参数'
  require_positive_integer "PORT" "$port_num"
  require_positive_integer "CTX_SIZE" "$ctx_size"
  require_positive_integer "IMAGE_MIN_TOKENS" "$image_min_tokens"
  require_positive_integer "IMAGE_MAX_TOKENS" "$image_max_tokens"
  require_positive_integer "BACKEND_READY_TIMEOUT_SEC" "$backend_ready_timeout_sec"

  if (( image_min_tokens > image_max_tokens )); then
    printf 'IMAGE_MIN_TOKENS 不能大于 IMAGE_MAX_TOKENS\n' >&2
    exit 1
  fi

  if [[ "$mmproj_offload" != "on" && "$mmproj_offload" != "off" ]]; then
    printf 'MMPROJ_OFFLOAD 仅支持 on 或 off,收到: %s\n' "$mmproj_offload" >&2
    exit 1
  fi

  resolve_runtime_profile
  llama_server_bin="$(resolve_llama_server_bin)"
  mkdir -p "$runtime_dir"
  : > "$stdout_log"
  : > "$stderr_log"

  log_stage '阶段 2/6: 检查或下载主模型'
  download_if_missing "$model_path" "$gguf_url" "主模型"
  log_stage '阶段 3/6: 检查或下载视觉模型'
  download_if_missing "$mmproj_path" "$mmproj_url" "视觉模型"

  log_stage '阶段 4/6: 校验模型文件'
  verify_sha256 "$model_path" "${MODEL_GGUF_SHA256:-}"
  verify_sha256 "$mmproj_path" "${MODEL_MMPROJ_SHA256:-}"

  local args=(
    -m "$model_path"
    -mm "$mmproj_path"
    --n-gpu-layers all
    --flash-attn on
    --fit on
    --fit-target 256
    --temp 1.0
    --top-p 0.95
    --top-k 20
    --min-p 0.1
    --presence-penalty 1.5
    --repeat-penalty 1.05
    -n "$MAX_TOKENS"
    --reasoning-budget "$REASONING_BUDGET"
    -c "$ctx_size"
    --image-min-tokens "$image_min_tokens"
    --image-max-tokens "$image_max_tokens"
    --host "$host_addr"
    --port "$port_num"
    --webui
  )

  if [[ "$mmproj_offload" == "off" ]]; then
    args+=(--no-mmproj-offload)
  else
    args+=(--mmproj-offload)
  fi

  log_stage '阶段 5/6: 启动 llama-server'
  log_step "启动参数: host=$host_addr port=$port_num think=${THINK_MODE:-think-on}"
  "$llama_server_bin" "${args[@]}" >"$stdout_log" 2>"$stderr_log" &
  llama_pid=$!
  log_step "llama-server 已启动: PID ${llama_pid}"

  # FIX: this script runs as PID 1 in the container, where signals with a
  # default disposition are ignored. Without a trap, `docker stop` would
  # hang until the follow-up SIGKILL. Forward TERM/INT to llama-server.
  trap 'kill "$llama_pid" 2>/dev/null || true' TERM INT

  log_stage '阶段 6/6: 等待模型加载到 GPU'
  if ! wait_for_backend_ready "$port_num" "$backend_ready_timeout_sec" "$llama_pid" "$stdout_log" "$stderr_log"; then
    if kill -0 "$llama_pid" 2>/dev/null; then
      kill "$llama_pid" 2>/dev/null || true
      wait "$llama_pid" 2>/dev/null || true
    fi
    exit 1
  fi

  # Block for the server's lifetime and propagate its exit status. A trapped
  # signal interrupts `wait` with status > 128; wait once more in that case
  # so the child is actually reaped before we exit.
  local rc=0
  wait "$llama_pid" || rc=$?
  if (( rc > 128 )); then
    wait "$llama_pid" || rc=$?
  fi
  return "$rc"
}

main "$@"
|
||||
156
docker/backend/entrypoint_helpers.sh
Normal file
156
docker/backend/entrypoint_helpers.sh
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env bash
# Helper library for the ToolHub backend entrypoint. Sourced, not executed:
# relies on log_step and RECENT_LOG_LINE_COUNT from the sourcing script.

# Dump the tails of llama-server's captured stdout/stderr after a failed
# start, so the container logs show why the server never became healthy.
show_recent_server_logs() {
  local out_log="$1"
  local err_log="$2"

  log_step '后端启动失败,最近日志如下'
  if [[ -s "$out_log" ]]; then
    log_step '=== 最近标准输出 ==='
    tail -n "$RECENT_LOG_LINE_COUNT" "$out_log"
  fi
  if [[ -s "$err_log" ]]; then
    log_step '=== 最近标准错误 ==='
    tail -n "$RECENT_LOG_LINE_COUNT" "$err_log" >&2
  fi
}
|
||||
|
||||
# Succeed once llama-server answers its /health endpoint on localhost.
# All curl output is discarded; only the exit status matters.
probe_backend_ready() {
  local port="$1"
  curl -fsS "http://127.0.0.1:${port}/health" >/dev/null 2>&1
}
|
||||
|
||||
# Poll once a second until the backend is healthy, the server process dies,
# or the timeout elapses.
# Arguments: port, timeout in seconds, server PID, stdout log, stderr log.
# Returns:   0 when the health check passes, 1 otherwise (with log dump).
wait_for_backend_ready() {
  local port="$1"
  local timeout_sec="$2"
  local server_pid="$3"
  local out_log="$4"
  local err_log="$5"
  local waited=0

  while (( waited < timeout_sec )); do
    # A dead server will never become ready; fail fast with its logs.
    if ! kill -0 "$server_pid" 2>/dev/null; then
      log_step '后端启动失败: llama-server 进程已提前退出'
      show_recent_server_logs "$out_log" "$err_log"
      return 1
    fi
    if probe_backend_ready "$port"; then
      log_step '后端健康检查已通过,网关会继续完成预热'
      return 0
    fi
    log_step "等待模型加载到 GPU... ${waited}/${timeout_sec} 秒"
    sleep 1
    waited=$((waited + 1))
  done

  log_step "后端在 ${timeout_sec} 秒内未就绪"
  show_recent_server_logs "$out_log" "$err_log"
  return 1
}
|
||||
|
||||
# Render a byte count as a human-readable string, e.g. "1.5 KiB".
# No trailing newline: callers embed the result via $(...).
format_bytes() {
  local byte_count="$1"
  awk -v n="$byte_count" '
    BEGIN {
      unit_count = split("B KiB MiB GiB TiB", unit, " ")
      v = n + 0
      u = 1
      while (v >= 1024 && u < unit_count) {
        v /= 1024
        u++
      }
      printf "%.1f %s", v, unit[u]
    }
  '
}
|
||||
|
||||
# Print the Content-Length advertised for a URL via a HEAD request.
# Redirects (-L) can yield several headers; the last response wins.
# Prints nothing when the header is absent; non-zero exit if curl fails.
resolve_content_length() {
  local target_url="$1"
  curl -fsSLI "$target_url" \
    | tr -d '\r' \
    | awk 'tolower($1) == "content-length:" { print $2 }' \
    | tail -n 1
}
|
||||
|
||||
# Print a file's size in bytes, or 0 when the file does not exist.
# Uses GNU stat (-c); fine for this Debian/Ubuntu-based image.
read_file_size() {
  local target="$1"
  if [[ ! -f "$target" ]]; then
    printf '0\n'
    return
  fi
  stat -c '%s' "$target"
}
|
||||
|
||||
# Build a one-line download progress message. When the total size is a
# known positive integer a percentage is included; otherwise only the
# running byte count is shown.
# Arguments: label, current bytes, total bytes (may be empty), bytes/sec.
render_progress_message() {
  local label="$1"
  local current_bytes="$2"
  local total_bytes="$3"
  local speed_bytes="$4"

  local current_text total_text speed_text
  current_text="$(format_bytes "$current_bytes")"
  speed_text="$(format_bytes "$speed_bytes")"
  total_text="$(format_bytes "${total_bytes:-0}")"

  # The regex implies non-empty; the arithmetic test matches the original -gt 0.
  if [[ "$total_bytes" =~ ^[0-9]+$ ]] && (( total_bytes > 0 )); then
    awk -v label="$label" -v current="$current_bytes" -v total="$total_bytes" \
      -v current_text="$current_text" -v total_text="$total_text" -v speed_text="$speed_text" '
      BEGIN {
        pct = (current / total) * 100
        printf "下载%s: %.1f%% %s / %s %s/s\n",
          label, pct, current_text, total_text, speed_text
      }
    '
    return
  fi

  printf '下载%s: 已下载 %s %s/s\n' "$label" "$current_text" "$speed_text"
}
|
||||
|
||||
# Ensure a model file exists at $1, downloading from $2 when absent.
# $3 is a human-readable label for log lines. The transfer goes to
# "$path.part" first (resumable via curl --continue-at) and is renamed
# only on success, so an interrupted run never leaves a truncated file.
# Exits the script on download failure.
download_if_missing() {
  local dest_path="$1"
  local source_url="$2"
  local label="$3"
  local partial_path="${dest_path}.part"
  local total_bytes=""
  local last_bytes=0
  local now_bytes=0
  local rate_bytes=0
  local downloader_pid

  mkdir -p "$(dirname "$dest_path")"
  if [[ -f "$dest_path" ]]; then
    log_step "检测到现有${label},跳过下载"
    return
  fi

  log_step "下载${label}: $source_url"
  # Content-Length is best-effort: when unavailable, progress messages
  # fall back to a raw byte count instead of a percentage.
  total_bytes="$(resolve_content_length "$source_url" || true)"
  last_bytes="$(read_file_size "$partial_path")"

  curl --fail --location --retry 5 --retry-delay 2 --retry-connrefused \
    --continue-at - --output "$partial_path" --silent --show-error "$source_url" &
  downloader_pid=$!

  # Poll the growing partial file every 2 seconds; the delta over the
  # 2-second sleep gives the transfer rate (clamped at 0 after a retry).
  while kill -0 "$downloader_pid" 2>/dev/null; do
    sleep 2
    now_bytes="$(read_file_size "$partial_path")"
    rate_bytes=$(( (now_bytes - last_bytes) / 2 ))
    if (( rate_bytes < 0 )); then
      rate_bytes=0
    fi
    log_step "$(render_progress_message "$label" "$now_bytes" "$total_bytes" "$rate_bytes")"
    last_bytes="$now_bytes"
  done

  if ! wait "$downloader_pid"; then
    printf '下载失败: %s\n' "$source_url" >&2
    exit 1
  fi

  now_bytes="$(read_file_size "$partial_path")"
  log_step "下载${label}完成: $(format_bytes "$now_bytes")"
  mv "$partial_path" "$dest_path"
}
|
||||
14
docker/gateway/Dockerfile
Normal file
14
docker/gateway/Dockerfile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Gateway image: Python service that fronts the llama.cpp backend.
FROM python:3.11-slim

ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Copy only requirements first so source edits don't bust the pip cache layer.
COPY requirements.txt /app/requirements.txt

RUN python -m pip install --no-cache-dir --upgrade pip wheel \
    && python -m pip install --no-cache-dir -r /app/requirements.txt

COPY . /app

CMD ["python", "run_8080_toolhub_gateway.py", "--host", "0.0.0.0", "--port", "8080"]
|
||||
Reference in New Issue
Block a user