first

2026-03-11 16:49:00 +08:00
commit 52d7d14795
53 changed files with 4991 additions and 0 deletions
--- a/run_8080_toolhub_gateway.py
+++ b/run_8080_toolhub_gateway.py
@@ -0,0 +1,593 @@
+#!/usr/bin/env python3
+import argparse
+import os
+import threading
+import time
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from typing import Any, Dict, List, Set, Tuple
+
+import requests
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response, StreamingResponse
+from starlette.concurrency import run_in_threadpool
+
+from toolhub_gateway_agent import (
+    build_non_stream_response,
+    run_chat_completion,
+    stream_chat_completion,
+)
+
+DEFAULT_GATEWAY_HOST = '127.0.0.1'
+DEFAULT_GATEWAY_PORT = 8080
+DEFAULT_BACKEND_BASE = 'http://127.0.0.1:8081'
+DEFAULT_MODEL_SERVER = 'http://127.0.0.1:8081/v1'
+DEFAULT_TIMEOUT_SEC = 180
+DEFAULT_BACKEND_WAIT_HINT = ''
+DEFAULT_ACCESS_URLS = 'http://127.0.0.1:8080,http://localhost:8080'
+READY_ANNOUNCE_INTERVAL_SEC = 2
+WAIT_LOG_INTERVAL_SEC = 10
+WARMUP_MESSAGE = '请只回复一个字：好'
+WARMUP_PARSE_ERROR_MARKER = 'Failed to parse input'
+STREAM_CHUNK_BYTES = 8192
+SUPPORTED_PROXY_METHODS = ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'OPTIONS', 'HEAD']
+HOP_HEADERS = {
+    'connection',
+    'keep-alive',
+    'proxy-authenticate',
+    'proxy-authorization',
+    'te',
+    'trailers',
+    'transfer-encoding',
+    'upgrade',
+}
+LOCAL_CONFIG_KEY = 'LlamaCppWebui.config'
+LOCAL_OVERRIDES_KEY = 'LlamaCppWebui.userOverrides'
+WEBUI_SETTINGS_PATCH = f"""
+<script>
+(function () {{
+  try {{
+    var cfgKey = '{LOCAL_CONFIG_KEY}';
+    var ovKey = '{LOCAL_OVERRIDES_KEY}';
+    var cfg = JSON.parse(localStorage.getItem(cfgKey) || '{{}}');
+    cfg.showMessageStats = true;
+    cfg.keepStatsVisible = false;
+    cfg.showThoughtInProgress = true;
+    cfg.disableReasoningParsing = false;
+    localStorage.setItem(cfgKey, JSON.stringify(cfg));
+
+    var overrides = JSON.parse(localStorage.getItem(ovKey) || '[]');
+    var set = new Set(Array.isArray(overrides) ? overrides : []);
+    ['showMessageStats', 'keepStatsVisible', 'showThoughtInProgress', 'disableReasoningParsing']
+      .forEach(function (k) {{ set.add(k); }});
+    localStorage.setItem(ovKey, JSON.stringify(Array.from(set)));
+  }} catch (e) {{
+    console.error('webui settings patch failed', e);
+  }}
+}})();
+</script>
+<style>
+.chat-processing-info-container {{
+  display: none !important;
+}}
+</style>
+""".strip()
+BACKEND_LOADING_HTML = """
+<!doctype html>
+<html lang="zh-CN">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>ToolHub 正在准备中</title>
+  <style>
+    :root {
+      color-scheme: light;
+      font-family: "Segoe UI", "PingFang SC", "Microsoft YaHei", sans-serif;
+      background: #0f172a;
+      color: #e2e8f0;
+    }
+    body {
+      margin: 0;
+      min-height: 100vh;
+      display: grid;
+      place-items: center;
+      background:
+        radial-gradient(circle at top, rgba(59, 130, 246, 0.18), transparent 45%),
+        linear-gradient(180deg, #111827, #020617);
+    }
+    main {
+      width: min(680px, calc(100vw - 32px));
+      padding: 28px;
+      border-radius: 20px;
+      background: rgba(15, 23, 42, 0.88);
+      border: 1px solid rgba(148, 163, 184, 0.24);
+      box-shadow: 0 24px 80px rgba(15, 23, 42, 0.45);
+    }
+    h1 {
+      margin: 0 0 12px;
+      font-size: 28px;
+    }
+    .status {
+      display: flex;
+      align-items: center;
+      gap: 16px;
+      margin-bottom: 18px;
+    }
+    .spinner-shell {
+      position: relative;
+      width: 40px;
+      height: 40px;
+      flex: 0 0 auto;
+    }
+    .spinner-ring {
+      position: absolute;
+      inset: 0;
+      border-radius: 999px;
+      border: 3px solid rgba(148, 163, 184, 0.16);
+      border-top-color: #93c5fd;
+      border-right-color: rgba(96, 165, 250, 0.92);
+      animation: spin 12s steps(12, end) infinite;
+      will-change: transform;
+      transform: translateZ(0);
+    }
+    .spinner-ring::after {
+      content: "";
+      position: absolute;
+      top: 3px;
+      left: 50%;
+      width: 7px;
+      height: 7px;
+      margin-left: -3.5px;
+      border-radius: 999px;
+      background: #e0f2fe;
+      box-shadow: 0 0 12px rgba(96, 165, 250, 0.78);
+    }
+    .spinner-core {
+      position: absolute;
+      inset: 9px;
+      border-radius: 999px;
+      background:
+        radial-gradient(circle, rgba(191, 219, 254, 0.96) 0, rgba(147, 197, 253, 0.82) 34%, rgba(59, 130, 246, 0.18) 65%, transparent 72%);
+    }
+    p {
+      margin: 10px 0;
+      line-height: 1.7;
+      color: #cbd5e1;
+    }
+    .state-line {
+      margin-top: 14px;
+      color: #93c5fd;
+    }
+    .elapsed-line {
+      margin-top: 8px;
+      color: #cbd5e1;
+      font-variant-numeric: tabular-nums;
+    }
+    .hint-box {
+      margin-top: 16px;
+      padding: 14px 16px;
+      border-radius: 14px;
+      background: rgba(15, 23, 42, 0.72);
+      border: 1px solid rgba(148, 163, 184, 0.18);
+    }
+    details {
+      margin-top: 16px;
+      color: #94a3b8;
+    }
+    summary {
+      cursor: pointer;
+    }
+    pre {
+      margin: 10px 0 0;
+      padding: 12px;
+      border-radius: 12px;
+      background: rgba(2, 6, 23, 0.86);
+      border: 1px solid rgba(148, 163, 184, 0.16);
+      color: #cbd5e1;
+      white-space: pre-wrap;
+      word-break: break-word;
+      font-family: "Cascadia Code", "Consolas", monospace;
+      font-size: 13px;
+      line-height: 1.6;
+    }
+    code {
+      font-family: "Cascadia Code", "Consolas", monospace;
+      color: #f8fafc;
+    }
+    @keyframes spin {
+      from { transform: rotate(0deg); }
+      to { transform: rotate(360deg); }
+    }
+    @media (prefers-reduced-motion: reduce) {
+      .spinner-ring {
+        animation-duration: 12s;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <div class="status">
+      <div class="spinner-shell" aria-hidden="true">
+        <div class="spinner-ring"></div>
+        <div class="spinner-core"></div>
+      </div>
+      <h1>ToolHub 正在准备中</h1>
+    </div>
+    <p>网关已经启动，但模型后端暂时还没有就绪。</p>
+    <p>如果这是第一次启动，程序可能正在下载模型文件，或者正在把模型加载到 GPU。</p>
+    <p>页面会停留在这个等待界面里，并自动检查后端状态。准备完成后会自动进入聊天界面，不再整页反复刷新。</p>
+    <p class="state-line" id="state-line">正在检查后端状态...</p>
+    <p class="elapsed-line" id="elapsed-line">已等待 0 秒</p>
+    <div class="hint-box">
+      <p>如果你是刚在终端里执行了启动命令，最直接的进度信息通常就在那个终端窗口里。</p>
+      __HINT_BLOCK__
+    </div>
+    <details>
+      <summary>查看技术详情</summary>
+      <pre>__DETAIL__</pre>
+    </details>
+  </main>
+  <script>
+    (function () {
+      var stateLine = document.getElementById('state-line');
+      var elapsedLine = document.getElementById('elapsed-line');
+      var healthUrl = '/gateway/health';
+      var startedAt = Date.now();
+
+      function updateState(message) {
+        if (stateLine) {
+          stateLine.textContent = message;
+        }
+      }
+
+      function updateElapsed() {
+        if (!elapsedLine) {
+          return;
+        }
+        var elapsedSec = Math.floor((Date.now() - startedAt) / 1000);
+        elapsedLine.textContent = '已等待 ' + elapsedSec + ' 秒';
+      }
+
+      async function pollHealth() {
+        try {
+          var response = await fetch(healthUrl, { cache: 'no-store' });
+          var payload = await response.json();
+          if (payload.status === 'ok') {
+            updateState('后端已经就绪，正在进入聊天界面...');
+            updateElapsed();
+            window.location.reload();
+            return;
+          }
+          updateState('模型仍在准备中，页面会自动继续等待。');
+        } catch (error) {
+          updateState('暂时还连不上后端，继续等待即可。');
+        }
+        window.setTimeout(pollHealth, 4000);
+      }
+
+      updateElapsed();
+      window.setInterval(updateElapsed, 1000);
+      window.setTimeout(pollHealth, 1200);
+    })();
+  </script>
+</body>
+</html>
+""".strip()
+
+
+@dataclass(frozen=True)
+class GatewayConfig:
+    backend_base: str
+    model_server: str
+    gateway_host: str
+    gateway_port: int
+    timeout_sec: int = DEFAULT_TIMEOUT_SEC
+    backend_wait_hint: str = DEFAULT_BACKEND_WAIT_HINT
+    access_urls: Tuple[str, ...] = ()
+
+
+@dataclass
+class GatewayState:
+    ready_event: threading.Event
+
+
+def parse_args() -> GatewayConfig:
+    parser = argparse.ArgumentParser(description='Run 8080 toolhub gateway with 8081 llama-server backend.')
+    parser.add_argument('--host', default=os.getenv('GATEWAY_HOST', DEFAULT_GATEWAY_HOST))
+    parser.add_argument('--port', type=int, default=int(os.getenv('GATEWAY_PORT', str(DEFAULT_GATEWAY_PORT))))
+    parser.add_argument('--backend-base', default=os.getenv('BACKEND_BASE', DEFAULT_BACKEND_BASE))
+    parser.add_argument('--model-server', default=os.getenv('MODEL_SERVER', DEFAULT_MODEL_SERVER))
+    parser.add_argument('--timeout-sec', type=int, default=int(os.getenv('GATEWAY_TIMEOUT_SEC', str(DEFAULT_TIMEOUT_SEC))))
+    parser.add_argument('--backend-wait-hint', default=os.getenv('BACKEND_WAIT_HINT', DEFAULT_BACKEND_WAIT_HINT))
+    parser.add_argument('--access-urls', default=os.getenv('ACCESS_URLS', DEFAULT_ACCESS_URLS))
+    args = parser.parse_args()
+    return GatewayConfig(
+        backend_base=args.backend_base.rstrip('/'),
+        model_server=args.model_server.rstrip('/'),
+        gateway_host=args.host,
+        gateway_port=args.port,
+        timeout_sec=args.timeout_sec,
+        backend_wait_hint=args.backend_wait_hint.strip(),
+        access_urls=parse_access_urls(args.access_urls),
+    )
+
+
+def parse_access_urls(raw: str) -> Tuple[str, ...]:
+    urls = [item.strip() for item in raw.split(',') if item.strip()]
+    return tuple(dict.fromkeys(urls))
+
+
+def filtered_headers(headers: Dict[str, str]) -> Dict[str, str]:
+    blocked = HOP_HEADERS | {'host', 'content-length', 'proxy-connection'}
+    return {key: value for key, value in headers.items() if key.lower() not in blocked}
+
+
+def drop_headers_ci(headers: Dict[str, str], names: Set[str]) -> Dict[str, str]:
+    lowered = {name.lower() for name in names}
+    return {key: value for key, value in headers.items() if key.lower() not in lowered}
+
+
+def build_backend_url(base: str, path: str, query: str) -> str:
+    if not query:
+        return f'{base}{path}'
+    return f'{base}{path}?{query}'
+
+
+def stream_upstream(upstream: requests.Response):
+    try:
+        for chunk in upstream.iter_content(chunk_size=STREAM_CHUNK_BYTES):
+            if chunk:
+                yield chunk
+    finally:
+        upstream.close()
+
+
+def inject_webui_settings(html: str) -> str:
+    if WEBUI_SETTINGS_PATCH in html:
+        return html
+    if '<head>' in html:
+        return html.replace('<head>', f'<head>\n{WEBUI_SETTINGS_PATCH}\n', 1)
+    if '<body>' in html:
+        return html.replace('<body>', f'<body>\n{WEBUI_SETTINGS_PATCH}\n', 1)
+    return f'{WEBUI_SETTINGS_PATCH}\n{html}'
+
+
+def build_backend_loading_response(detail: str, wait_hint: str) -> Response:
+    safe_detail = detail.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+    hint_block = ''
+    if wait_hint:
+        safe_hint = wait_hint.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+        hint_block = f'<p>如果你想单独查看后端准备进度，可以执行：<br><code>{safe_hint}</code></p>'
+    html = BACKEND_LOADING_HTML.replace('__DETAIL__', safe_detail).replace('__HINT_BLOCK__', hint_block)
+    return Response(
+        content=html,
+        status_code=200,
+        media_type='text/html; charset=utf-8',
+        headers={'Cache-Control': 'no-store, max-age=0'},
+    )
+
+
+def is_root_request(request: Request, path: str) -> bool:
+    return request.method == 'GET' and path in {'/', '/index.html'}
+
+
+def is_backend_wait_status(status_code: int) -> bool:
+    return status_code in {502, 503, 504}
+
+
+def format_access_urls(access_urls: Tuple[str, ...]) -> str:
+    return ' '.join(access_urls)
+
+
+def check_backend_ready(cfg: GatewayConfig) -> bool:
+    try:
+        response = requests.get(f'{cfg.backend_base}/health', timeout=cfg.timeout_sec)
+        response.raise_for_status()
+    except Exception:  # noqa: BLE001
+        return False
+    return True
+
+
+def announce_access_urls(cfg: GatewayConfig) -> None:
+    if not cfg.access_urls:
+        return
+    print(
+        f'[toolhub-gateway] 网页入口已经开放，正在加载模型，完成后可访问: {format_access_urls(cfg.access_urls)}',
+        flush=True,
+    )
+
+
+def announce_backend_ready(cfg: GatewayConfig) -> None:
+    if not cfg.access_urls:
+        return
+    print(
+        f'[toolhub-gateway] 模型已完成加载和预热，可以打开: {format_access_urls(cfg.access_urls)}',
+        flush=True,
+    )
+
+
+def is_gateway_ready(state: GatewayState) -> bool:
+    return state.ready_event.is_set()
+
+
+def warmup_model(cfg: GatewayConfig) -> Tuple[bool, str]:
+    payload = {
+        'messages': [{'role': 'user', 'content': WARMUP_MESSAGE}],
+        'max_tokens': 1,
+        'stream': False,
+        'temperature': 0,
+    }
+    try:
+        response = requests.post(
+            f'{cfg.model_server}/chat/completions',
+            json=payload,
+            timeout=cfg.timeout_sec,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return False, f'模型预热请求失败: {exc}'
+    if response.ok:
+        return True, '模型预热已完成'
+    body = response.text.strip()
+    if response.status_code == 500 and WARMUP_PARSE_ERROR_MARKER in body:
+        return True, '模型首轮预热已经完成'
+    return False, f'模型预热暂未完成: HTTP {response.status_code} {body[:200]}'
+
+
+def run_ready_announcer(cfg: GatewayConfig, state: GatewayState) -> None:
+    last_wait_detail = ''
+    last_wait_log_at = 0.0
+    announce_access_urls(cfg)
+    while True:
+        if check_backend_ready(cfg):
+            ready, wait_detail = warmup_model(cfg)
+        else:
+            ready, wait_detail = False, '后端健康检查尚未通过'
+        if ready:
+            state.ready_event.set()
+            announce_backend_ready(cfg)
+            return
+        now = time.monotonic()
+        if wait_detail != last_wait_detail or (now - last_wait_log_at) >= WAIT_LOG_INTERVAL_SEC:
+            print(f'[toolhub-gateway] 后端仍在准备中: {wait_detail}', flush=True)
+            last_wait_detail = wait_detail
+            last_wait_log_at = now
+        time.sleep(READY_ANNOUNCE_INTERVAL_SEC)
+
+
+async def handle_gateway_health(cfg: GatewayConfig, state: GatewayState) -> Dict[str, Any]:
+    status = 'ok' if is_gateway_ready(state) else 'warming'
+    backend_error = ''
+    try:
+        health = requests.get(f'{cfg.backend_base}/health', timeout=cfg.timeout_sec)
+        health.raise_for_status()
+    except Exception as exc:  # noqa: BLE001
+        status = 'degraded'
+        backend_error = str(exc)
+    return {'status': status, 'backend_base': cfg.backend_base, 'backend_error': backend_error}
+
+
+async def handle_chat_completions(request: Request, cfg: GatewayConfig) -> Response:
+    payload = await request.json()
+    stream = bool(payload.get('stream', False))
+    if stream:
+        try:
+            iterator = stream_chat_completion(payload, cfg.model_server, cfg.timeout_sec)
+        except Exception as exc:  # noqa: BLE001
+            error = {'error': {'code': 500, 'type': 'gateway_error', 'message': str(exc)}}
+            return JSONResponse(status_code=500, content=error)
+        return StreamingResponse(iterator, media_type='text/event-stream')
+
+    try:
+        result = await run_in_threadpool(run_chat_completion, payload, cfg.model_server, cfg.timeout_sec)
+    except Exception as exc:  # noqa: BLE001
+        error = {'error': {'code': 500, 'type': 'gateway_error', 'message': str(exc)}}
+        return JSONResponse(status_code=500, content=error)
+
+    answer = result['answer']
+    model = result['model']
+    reasoning = result.get('reasoning', '')
+    return JSONResponse(content=build_non_stream_response(answer, model, reasoning))
+
+
+async def handle_proxy(request: Request, full_path: str, cfg: GatewayConfig, state: GatewayState) -> Response:
+    path = '/' + full_path
+    if is_root_request(request, path) and not is_gateway_ready(state):
+        return build_backend_loading_response('模型正在加载或预热，完成后会自动进入聊天界面。', cfg.backend_wait_hint)
+    url = build_backend_url(cfg.backend_base, path, request.url.query)
+    headers = filtered_headers(dict(request.headers))
+    body = await request.body()
+
+    try:
+        upstream = requests.request(
+            method=request.method,
+            url=url,
+            headers=headers,
+            data=body,
+            stream=True,
+            timeout=cfg.timeout_sec,
+            allow_redirects=False,
+        )
+    except Exception as exc:  # noqa: BLE001
+        if is_root_request(request, path):
+            return build_backend_loading_response(str(exc), cfg.backend_wait_hint)
+        if request.method == 'GET' and path == '/favicon.ico':
+            return Response(status_code=204)
+        error = {'error': {'type': 'proxy_error', 'message': str(exc)}}
+        return JSONResponse(status_code=502, content=error)
+
+    response_headers = filtered_headers(dict(upstream.headers))
+    content_type = upstream.headers.get('content-type', '')
+    if is_root_request(request, path) and is_backend_wait_status(upstream.status_code):
+        detail = upstream.text.strip() or f'backend returned {upstream.status_code}'
+        upstream.close()
+        return build_backend_loading_response(detail, cfg.backend_wait_hint)
+    if request.method == 'GET' and path == '/favicon.ico' and is_backend_wait_status(upstream.status_code):
+        upstream.close()
+        return Response(status_code=204)
+    if 'text/event-stream' in content_type:
+        return StreamingResponse(
+            stream_upstream(upstream),
+            status_code=upstream.status_code,
+            headers=response_headers,
+            media_type='text/event-stream',
+        )
+
+    is_webui_html = (
+        request.method == 'GET'
+        and path in {'/', '/index.html'}
+        and upstream.status_code == 200
+        and 'text/html' in content_type
+    )
+    if is_webui_html:
+        encoding = upstream.encoding or 'utf-8'
+        html = upstream.content.decode(encoding, errors='replace')
+        injected = inject_webui_settings(html)
+        upstream.close()
+        clean_headers = drop_headers_ci(response_headers, {'content-encoding', 'content-length', 'etag'})
+        return Response(
+            content=injected.encode('utf-8'),
+            status_code=200,
+            headers=clean_headers,
+            media_type='text/html; charset=utf-8',
+        )
+
+    upstream.raw.decode_content = False
+    data = upstream.raw.read(decode_content=False)
+    upstream.close()
+    return Response(content=data, status_code=upstream.status_code, headers=response_headers)
+
+
+def create_app(cfg: GatewayConfig, state: GatewayState) -> FastAPI:
+    @asynccontextmanager
+    async def lifespan(_: FastAPI):
+        threading.Thread(target=run_ready_announcer, args=(cfg, state), daemon=True).start()
+        yield
+
+    app = FastAPI(title='Qwen3.5 ToolHub Gateway 8080', lifespan=lifespan)
+
+    @app.get('/gateway/health')
+    async def gateway_health() -> Dict[str, Any]:
+        return await handle_gateway_health(cfg, state)
+
+    @app.post('/v1/chat/completions')
+    async def chat_completions(request: Request) -> Response:
+        return await handle_chat_completions(request, cfg)
+
+    @app.api_route('/{full_path:path}', methods=SUPPORTED_PROXY_METHODS)
+    async def proxy_all(request: Request, full_path: str) -> Response:
+        return await handle_proxy(request, full_path, cfg, state)
+
+    return app
+
+
+def main() -> None:
+    cfg = parse_args()
+    state = GatewayState(ready_event=threading.Event())
+    app = create_app(cfg, state)
+    uvicorn.run(app, host=cfg.gateway_host, port=cfg.gateway_port, log_level='info')
+
+
+if __name__ == '__main__':
+    main()