commit a0e157655f6bd9e6f68197a5f7df7d00a23d3adc Author: laowang Date: Fri Nov 7 19:44:51 2025 +0800 Initial diff --git a/README.md b/README.md new file mode 100644 index 0000000..92c30c8 --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ + +# CodeZen +> 一个专注中文区的 GitHub 项目发现 + +--- + +### 📖 关于本项目 + +`CodeZen` 是一个全栈的 Web 应用,旨在解决“GitHub 上的中文项目信息过载”问题。 + +它通过一个 **7x24 小时无人值守**的爬虫,自动抓取 GitHub 上**近期**(7天内)、**热门**(>50星)且**包含中文内容**的仓库,并将其以一个美观、可交互的“卡片式”界面呈现。 + +本应用是**“HTMX 驱动”**的,所有筛选、排序和分页操作均可“无刷新”丝滑完成,提供了现代化的单页应用 (SPA) 体验。 + +### ✨ 主要功能 + +* **7x24 自动更新:** 应用启动后,`APScheduler` 后台调度器会自动(每 4 小时)抓取最新项目,无需人工干预。 +* **智能中文筛选:** 独有的“智能扫描”逻辑,优先检查项目 `description`,其次扫描 `README_CN.md` 或 `README.md`,确保 100% 收录中文相关项目。 +* **多重协同筛选 (100% 自动):** + * **语言筛选:** 自动从数据库中 `SELECT DISTINCT` 提取**所有已抓取**的语言,动态生成筛选器。 + * **主题筛选:** 自动**实时统计**数据库中 Top 20 热门 `Topics`,动态生成功能筛选器。 +* **协同工作流:** 语言、主题、排序、分页、搜索**五大功能**可以完美协同工作(例如:搜索“AI”项目中,`Python` 语言下,`按热度排序` 的第 2 页)。 +* **稳定搜索:** 采用稳定的“回车键”搜索,可(通过 HTMX)无刷新更新结果。 +* **现代 UI/UX:** + * 响应式卡片布局 (`Grid` + `flex-wrap`)。 + * 支持手动切换和系统记忆的“深色/浅色”模式。 + * 清晰的三区卡片视觉划分(标题、内容、元数据)。 + +### 🛠️ 技术栈 + +* **后端:** Flask, APScheduler +* **前端:** HTMX, Tailwind CSS, Material Symbols +* **数据库:** SQLite3 +* **抓取器:** Python (Requests, Collections) + +### 📁 文件结构 +``` + +CodeZen/ +├── webapp.py # 主程序: Flask + APScheduler 调度器 +├── scraper.py # 抓取器: 封装了所有 GitHub API 逻辑 +├── templates/ +│ ├── index.html # "外壳"模板 (含页眉, 页脚, 搜索框) +│ └── _projects_partial.html # "内容"模板 (含筛选器, 卡片, 分页) +└── github_projects.db # 自动生成的 SQLite 数据库 + +``` +### 🚀 如何运行 + +1. **克隆仓库** + ```bash + git clone https://git.wlens.top/laowang/CodeZen.git + cd CodeZen + ``` + +2. **创建并激活虚拟环境** + ```bash + python -m venv venv + + # Windows + .\venv\Scripts\activate + + # macOS / Linux + source venv/bin/activate + ``` + +3. **安装依赖** + ```bash + pip install -r requirements.txt + ``` + +4. 
**配置 GitHub Token (关键!)** + 打开 `webapp.py` 文件,找到顶部的配置区域,将 `GITHUB_TOKEN` 变量替换为你自己的 GitHub Personal Access Token。 + + ```python + # webapp.py + GITHUB_TOKEN = 'ghp_YOUR_REAL_GITHUB_TOKEN_HERE' # 替换你的KEY + SCHEDULE_HOURS = 4 # 自动抓取的间隔时间 (小时) + ``` + +5. **启动应用 (V6.0 自动化版)** + 运行主应用: + + ```bash + python webapp.py + ``` + + 应用启动后,它将**立即**在后台执行第一次抓取(你会在终端看到日志),这可能需要几分钟。抓取完成后,在浏览器中访问 `http://127.0.0.1:5000` 即可看到结果。 + + **应用会从此 7x24 自动在后台更新,你无需任何其他操作。** + +### 🧠 CodeZen 的“大脑”:收录与展示逻辑 + +#### 1. "要不要收录?" (抓取逻辑 - `scraper.py`) + +一个项目必须通过 3 道“门”才会被收录: + +1. **热度门 (API):** 7 天内创建 且 > 50 星。 +2. **重复门 (DB):** `repo_id` 未在数据库中。 +3. **中文门 (智能):** **(满足一项即可)** + * **a.** 项目的 `description` (简介) 本身包含中文。 + * **b.** (如果简介不含中文) `README_CN.md` 或 `README.md` 中包含中文。 + +#### 2. "介绍里怎么写?" (展示逻辑 - `webapp.py`) + +为了在卡片上提供最有价值的介绍,后端遵循“最佳内容优先”原则: + +1. **第 1 优先级:** 显示中文的 `description` (简介)。 +2. **第 2 优先级:** (如果简介是英文) 显示从 README 提取的中文 `readme_excerpt` (摘要)。 +3. **第 3 优先级:** (如果前两者都没有) 降级显示英文的 `description`。 +4. **第 4 优先级:** 显示 "暂无简介"。 + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7292c9e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +requests +Flask +APScheduler \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..3cf346d --- /dev/null +++ b/scraper.py @@ -0,0 +1,154 @@ +import requests +import sqlite3 +import re +import base64 +from datetime import datetime, timedelta +import os +import json + +# ============================================================================== +# --- ⚙️ 配置区域 --- +# ============================================================================== + +SEARCH_DAYS_AGO = 7 +MIN_STARS = 50 +DB_FILE = 'github_projects.db' +README_EXCERPT_LENGTH = 300 + +# ============================================================================== +# --- 脚本核心代码 --- +# ============================================================================== + +# SESSION = requests.Session() +# 
# SESSION.headers.update({'Authorization': f'token {GITHUB_TOKEN}'})

# Token values that mean "no real token was configured":
#   - '111112222233333'                  the value this module originally checked for
#   - 'YOUR_REAL_GITHUB_TOKEN_HERE'      the default shipped in webapp.py
#   - 'ghp_YOUR_REAL_GITHUB_TOKEN_HERE'  the example shown in README.md
_PLACEHOLDER_TOKENS = frozenset({
    '111112222233333',
    'YOUR_REAL_GITHUB_TOKEN_HERE',
    'ghp_YOUR_REAL_GITHUB_TOKEN_HERE',
})

# Compiled once; both patterns are applied to every README that is scanned.
_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')
_MARKDOWN_NOISE_RE = re.compile(r'#+\s*|\!\[.*\]\(.*\)|<.*?>|\[|\]|\(|\)')

# Chinese-specific README names, probed in this order before README.md.
_CHINESE_README_FILENAMES = (
    "README_CN.md",
    "README_zh.md",
    "README_zh-CN.md",
    "README-zh_CN.md",
)


def setup_database():
    """Create the ``projects`` table in ``DB_FILE`` if it does not exist yet."""
    print(" [Scheduler] 正在检查并设置数据库...")
    conn = sqlite3.connect(DB_FILE)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS projects (
                repo_id INTEGER PRIMARY KEY, name TEXT NOT NULL, url TEXT NOT NULL UNIQUE,
                description TEXT, readme_excerpt TEXT, language TEXT, stars INTEGER, pushed_at TIMESTAMP,
                owner_avatar_url TEXT, topics TEXT
            )
        ''')
        conn.commit()
    finally:
        # Close even when the DDL fails so the file handle is not leaked.
        conn.close()


def has_chinese(text):
    """Return True if *text* contains a character in U+4E00..U+9FA5.

    ``None`` and the empty string yield False.
    """
    if not text:
        return False
    return _CHINESE_CHAR_RE.search(text) is not None


def _fetch_repo_file(repo_full_name, filename, session):
    """Fetch one file through the GitHub contents API and return its text.

    Returns ``None`` when the response status is not 200 or when the payload
    cannot be decoded (non-JSON body, unexpected JSON shape, invalid base64).
    Exceptions raised by ``session.get`` itself propagate to the caller.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{filename}"
    response = session.get(url, timeout=10)
    if response.status_code != 200:
        return None
    try:
        encoded = response.json().get('content') or ''
        return base64.b64decode(encoded).decode('utf-8', errors='ignore')
    except (ValueError, AttributeError, TypeError):
        # ValueError covers JSON decode errors and binascii.Error; the other two
        # cover a payload that is not a dict or a non-string 'content' field.
        # A malformed file must not abort the whole scrape run.
        return None


def _make_excerpt(markdown_text):
    """Strip headings, images, tags and brackets, flatten newlines, and truncate
    to ``README_EXCERPT_LENGTH`` characters."""
    cleaned = _MARKDOWN_NOISE_RE.sub('', markdown_text).strip().replace('\n', ' ')
    return cleaned[:README_EXCERPT_LENGTH]


def get_readme_content_with_excerpt(repo_full_name, session):
    """Two-step README scan for Chinese content.

    Step 1 probes the Chinese-specific README file names; step 2 falls back to
    ``README.md``. The first file that exists and contains Chinese text wins.

    Args:
        repo_full_name: ``owner/repo`` string used to build the API URL.
        session: object with a ``requests.Session``-compatible ``get``.

    Returns:
        The cleaned excerpt string, or ``None`` when no README with Chinese
        text could be retrieved.
    """
    print(f" -> 智能扫描 '{repo_full_name}'...")

    for filename in _CHINESE_README_FILENAMES:
        try:
            text = _fetch_repo_file(repo_full_name, filename, session)
        except requests.RequestException:
            # Best effort: a network hiccup on one candidate should not stop the scan.
            continue
        if text is None:
            continue
        print(f" ✨ 找到中文文件: '{filename}'.")
        if has_chinese(text):
            return _make_excerpt(text)

    print(" - 检查默认 README.md...")
    try:
        text = _fetch_repo_file(repo_full_name, "README.md", session)
    except requests.RequestException as e:
        print(f" ⚠️ 抓取 README 出错: {e}")
        return None

    if text is None:
        print(" - 默认 README.md 未找到.")
        return None
    if has_chinese(text):
        print(" ✔️ 默认 README 中发现中文.")
        return _make_excerpt(text)
    print(" ❌ 默认 README 中无中文.")
    return None


def get_new_hot_repos(session):
    """Search GitHub for repos created in the last ``SEARCH_DAYS_AGO`` days with
    more than ``MIN_STARS`` stars.

    Only the first page (up to 100 items, sorted by stars descending) is read.

    Returns:
        The ``items`` list of the search response, or ``[]`` when the request
        fails or the body is not valid JSON (the error is logged).
    """
    search_date = (datetime.now() - timedelta(days=SEARCH_DAYS_AGO)).strftime('%Y-%m-%d')
    # Let the HTTP client encode the query (spaces, '>' and ':') instead of
    # interpolating it into the URL by hand.
    params = {
        'q': f'created:>{search_date} stars:>{MIN_STARS}',
        'sort': 'stars',
        'order': 'desc',
        'per_page': 100,
    }
    try:
        response = session.get(
            'https://api.github.com/search/repositories', params=params, timeout=20
        )
        response.raise_for_status()
        return response.json().get('items', [])
    except (requests.RequestException, ValueError) as e:
        # Previously swallowed silently, which made an invalid token or a rate
        # limit indistinguishable from "no matching repositories".
        print(f" [Scheduler] GitHub 搜索请求失败: {e}")
        return []


def discover_and_save_projects(github_token):
    """Run one scrape cycle and store newly discovered Chinese projects.

    A repo is stored when it is not yet in the database and either its
    description or one of its READMEs contains Chinese text.

    Args:
        github_token: GitHub personal access token. Empty values and the known
            placeholder strings cause the run to be skipped.

    Returns:
        The number of rows inserted during this run (0 when skipped).
    """
    if not github_token or github_token in _PLACEHOLDER_TOKENS:
        print(" [Scheduler] 错误: GitHub Token 未设置或无效. 跳过本次抓取。")
        return 0

    session = requests.Session()
    session.headers.update({'Authorization': f'token {github_token}'})

    conn = sqlite3.connect(DB_FILE)
    try:
        all_repos = get_new_hot_repos(session)
        if not all_repos:
            print(" [Scheduler] API 未返回任何匹配条件的项目。")
            return 0

        print(f" [Scheduler] API 返回 {len(all_repos)} 个项目. 开始过滤...")

        cursor = conn.cursor()
        new_projects_count = 0
        for repo in all_repos:
            repo_id, repo_full_name = repo['id'], repo['full_name']
            cursor.execute("SELECT 1 FROM projects WHERE repo_id = ?", (repo_id,))
            if cursor.fetchone():
                continue

            print(f"\n [Scheduler] 处理 [{repo_full_name}]...")
            # The API sends explicit nulls, so dict.get's default never applies;
            # use `or` to get the intended fallbacks.
            description = repo.get('description') or ''

            is_desc_chinese = has_chinese(description)
            # Only pay for README requests when the description is not Chinese.
            readme_excerpt = (
                None if is_desc_chinese
                else get_readme_content_with_excerpt(repo_full_name, session)
            )

            if not is_desc_chinese and not readme_excerpt:
                print(f" -> 丢弃 '{repo_full_name}' (无中文内容).")
                continue

            owner_avatar_url = (repo.get('owner') or {}).get('avatar_url')
            topics_json = json.dumps(repo.get('topics', []))

            # OR IGNORE: `url` is UNIQUE, so a different repo_id with an already
            # stored URL would otherwise raise IntegrityError and abort the run.
            cursor.execute('''
                INSERT OR IGNORE INTO projects (
                    repo_id, name, url, description, readme_excerpt, language,
                    stars, pushed_at, owner_avatar_url, topics
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                repo_id, repo['name'], repo['html_url'], description, readme_excerpt,
                repo.get('language') or 'N/A', repo['stargazers_count'],
                repo['pushed_at'], owner_avatar_url, topics_json
            ))
            conn.commit()
            if cursor.rowcount > 0:
                new_projects_count += 1
                print(f" 💾 已保存 '{repo_full_name}' 到数据库.")
            else:
                print(f" ⚠️ 跳过 '{repo_full_name}' (URL 已存在于数据库).")

        print(f"\n [Scheduler] 本轮抓取完成. 新增 {new_projects_count} 个项目.")
        return new_projects_count
    finally:
        # Release the DB handle and the HTTP connection pool on every exit path.
        conn.close()
        session.close()


# V6.0: main() and the `if __name__ == '__main__'` guard were removed; webapp.py
# imports setup_database and discover_and_save_projects from this module.

# (patch residue preserved from the original dump)
# \ No newline at end of file
# diff --git a/templates/_projects_partial.html b/templates/_projects_partial.html new file mode 100644 index 0000000..9a3a03e --- /dev/null +++ b/templates/_projects_partial.html @@ -0,0 +1,137 @@
+

CodeZen

+

一个专注中文区的 GitHub 项目发现

+
+ +
+ +

所有语言

+
+ {% for lang in languages %} + +

{{ lang }}

+
+ {% endfor %} +
+ +{% if popular_topics %} +
+ +

所有主题

+
+ {% for topic in popular_topics %} + +

{{ topic }}

+
+ {% endfor %} +
+{% endif %} + +
+ +

按时间排序

+
expand_more
+
+ +

按热度排序

+
expand_more
+
+
+ +
+ {% for project in projects %} + + +
+ {% if project['owner_avatar_url'] %} + avatar + {% endif %} +

{{ project['name'] }}

+ +
+ +

{{ project['display_text'] }}

+ + {% if project['topics'] %} +
+ {% for topic in project['topics'][:5] %} + {{ topic }} + {% endfor %} +
+ {% endif %} + +
+
star{{ format_stars(project['stars']) }}
+ {% if project['language'] and project['language'] != 'N/A' %} +
+ + {{ project['language'] }} +
+ {% endif %} +
+
+ {% else %} +
+
+
search_off
+
+

未找到项目

+

数据库为空或没有项目匹配您的筛选条件。

+
+
+
+ {% endfor %} +
+ +{% if total_pages > 1 %} + +{% endif %} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..cafabfa --- /dev/null +++ b/templates/index.html @@ -0,0 +1,103 @@ + + + + + + CodeZen - 中文项目发现 + + + + + + + + + + +
+
+
+
+
+ +
hub
+

CodeZen

+
+
+ + + + +
+
+ +
+ {% include '_projects_partial.html' %} +
+ +
+

© {{ current_year }} CodeZen. 数据来源 GitHub.

+

本页面数据由脚本自动抓取和汇总,非实时更新。

+
+
+
+
+
# (patch residue preserved from the original dump)
# \ No newline at end of file
# diff --git a/webapp.py b/webapp.py new file mode 100644 index 0000000..7b7bf55 --- /dev/null +++ b/webapp.py @@ -0,0 +1,163 @@
import atexit
import collections
import json
import math
import os
import re
import sqlite3
from datetime import datetime

from apscheduler.schedulers.background import BackgroundScheduler
from flask import Flask, render_template, request

from scraper import setup_database, discover_and_save_projects

# ==============================================================================
# --- Automation settings ---
# ==============================================================================
# GitHub token: taken from the GITHUB_TOKEN environment variable when set,
# otherwise from the literal below (edit it to hard-code a token).
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or 'YOUR_REAL_GITHUB_TOKEN_HERE'
# Interval between automatic scrape runs, in hours.
SCHEDULE_HOURS = 4
# The Werkzeug debugger lets anyone who can reach the port execute code, and the
# server binds to 0.0.0.0, so debug mode is opt-in: set CODEZEN_DEBUG=1.
DEBUG = os.environ.get('CODEZEN_DEBUG', '') == '1'
# ==============================================================================

app = Flask(__name__)
DB_FILE = 'github_projects.db'
PER_PAGE = 12
TOP_TOPICS_COUNT = 20

# Language name -> CSS class, exposed to templates as `language_colors`.
LANGUAGE_COLORS = {
    "Python": "bg-green-400",
    "JavaScript": "bg-yellow-400",
    "TypeScript": "bg-blue-400",
    "Go": "bg-cyan-400",
    "Rust": "bg-orange-400",
    "HTML": "bg-purple-400",
    "Vue": "bg-emerald-400",
    "Java": "bg-red-500",
    "C++": "bg-pink-500",
    "C#": "bg-indigo-500",
    "PHP": "bg-violet-500",
    "Ruby": "bg-red-600",
    "Jupyter Notebook": "bg-orange-300",
}

_CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')

# SQL fragment: LIKE with an explicit escape character so user-supplied '%' and
# '_' can be matched literally.
_LIKE = "LIKE ? ESCAPE '\\'"


def get_db_connection():
    """Open ``DB_FILE`` with rows addressable by column name."""
    conn = sqlite3.connect(DB_FILE)
    conn.row_factory = sqlite3.Row
    return conn


def format_stars(num):
    """Format a star count for display: ``None`` -> "0", >= 1000 -> "1.2k"."""
    if num is None:
        return "0"
    if num >= 1000:
        return f"{num / 1000:.1f}k"
    return str(num)


def has_chinese(text):
    """Return True if *text* contains a character in U+4E00..U+9FA5."""
    if not text:
        return False
    return _CHINESE_CHAR_RE.search(text) is not None


@app.context_processor
def utility_processor():
    """Expose ``format_stars`` and ``language_colors`` to every template."""
    return dict(format_stars=format_stars, language_colors=LANGUAGE_COLORS)


def scheduled_scrape_job():
    """Scheduler job: run one scrape cycle with the configured token."""
    print(f"\n--- [ {datetime.now()} ] ---")
    print(" [Scheduler] 开始执行定时抓取任务...")
    discover_and_save_projects(GITHUB_TOKEN)
    print(" [Scheduler] 定时抓取任务执行完毕。")
    print(f"--- [ 下次运行在 {SCHEDULE_HOURS} 小时后 ] ---")


def _escape_like(term):
    """Escape the LIKE wildcards and the escape character in *term*."""
    return term.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def _popular_topics(conn):
    """Return the ``TOP_TOPICS_COUNT`` most frequent topics across all projects."""
    counter = collections.Counter()
    rows = conn.execute(
        "SELECT topics FROM projects WHERE topics IS NOT NULL AND topics != '[]'"
    )
    for row in rows:
        try:
            topics = json.loads(row['topics'])
        except json.JSONDecodeError:
            continue
        # Guard against a stored JSON string: Counter.update would count its characters.
        if isinstance(topics, list):
            counter.update(topics)
    return [topic for topic, _count in counter.most_common(TOP_TOPICS_COUNT)]


def _build_filters(query, current_lang, current_topic):
    """Build the WHERE clause (possibly empty) and its bound parameters."""
    clauses = []
    params = []
    if query:
        pattern = f'%{_escape_like(query)}%'
        clauses.append(
            f"(name {_LIKE} OR description {_LIKE} OR readme_excerpt {_LIKE})"
        )
        params.extend([pattern, pattern, pattern])
    if current_lang:
        clauses.append("language = ?")
        params.append(current_lang)
    if current_topic:
        # topics is stored as a JSON array string; match the quoted element.
        clauses.append(f"topics {_LIKE}")
        params.append(f'%"{_escape_like(current_topic)}"%')
    where = " WHERE " + " AND ".join(clauses) if clauses else ""
    return where, params


def _prepare_project(row):
    """Convert a DB row into a dict with ``display_text`` and a parsed ``topics`` list."""
    project = dict(row)
    desc = project.get('description')
    excerpt = project.get('readme_excerpt')

    # Priority: Chinese description > README excerpt > any description > placeholder.
    if has_chinese(desc):
        project['display_text'] = desc
    elif excerpt:
        project['display_text'] = excerpt
    elif desc:
        project['display_text'] = desc
    else:
        project['display_text'] = '暂无简介'

    raw_topics = project.get('topics')
    try:
        project['topics'] = json.loads(raw_topics) if raw_topics else []
    except json.JSONDecodeError:
        project['topics'] = []
    return project


@app.route('/')
def index():
    """Render the project list, filtered/sorted/paginated from query parameters.

    Query parameters: ``page`` (int, values below 1 are treated as 1), ``q``,
    ``sort`` (``stars`` sorts by stars; anything else by ``pushed_at``),
    ``lang`` and ``topic``. Requests carrying an ``HX-Request`` header get only
    the partial template.
    """
    # Clamp so a zero/negative page cannot produce a negative OFFSET.
    page = max(request.args.get('page', 1, type=int), 1)
    query = request.args.get('q', '')
    sort_by = request.args.get('sort', 'pushed_at')
    current_lang = request.args.get('lang', '')
    current_topic = request.args.get('topic', '')

    conn = get_db_connection()
    try:
        languages = [
            row['language']
            for row in conn.execute(
                "SELECT DISTINCT language FROM projects "
                "WHERE language IS NOT NULL AND language != 'N/A' ORDER BY language ASC"
            )
        ]
        popular_topics = _popular_topics(conn)

        base_where, params = _build_filters(query, current_lang, current_topic)

        total_items = conn.execute(
            f'SELECT COUNT(*) FROM projects{base_where}', params
        ).fetchone()[0]
        total_pages = math.ceil(total_items / PER_PAGE)
        offset = (page - 1) * PER_PAGE

        # Only two fixed ORDER BY fragments are ever interpolated; user input is bound.
        order_by_clause = " ORDER BY stars DESC" if sort_by == 'stars' else " ORDER BY pushed_at DESC"
        sql_query = f"SELECT * FROM projects{base_where}{order_by_clause} LIMIT ? OFFSET ?"
        rows = conn.execute(sql_query, params + [PER_PAGE, offset]).fetchall()
    finally:
        # Close the connection even when a query raises.
        conn.close()

    template_context = dict(
        projects=[_prepare_project(row) for row in rows],
        search_query=query,
        current_sort=sort_by,
        current_year=datetime.now().year,
        current_page=page,
        total_pages=total_pages,
        languages=languages,
        current_lang=current_lang,
        popular_topics=popular_topics,
        current_topic=current_topic,
    )

    if request.headers.get('HX-Request'):
        return render_template('_projects_partial.html', **template_context)

    return render_template('index.html', **template_context)


if __name__ == '__main__':
    setup_database()

    print("\n" + "="*50)
    print(" 🚀 启动 CodeZen 自动化服务...")

    scheduler = BackgroundScheduler()

    # next_run_time=now makes the first scrape start right after the scheduler starts.
    scheduler.add_job(
        func=scheduled_scrape_job,
        trigger="interval",
        hours=SCHEDULE_HOURS,
        misfire_grace_time=60,
        next_run_time=datetime.now(),
    )
    scheduler.start()
    print(f" ✅ 后台抓取任务已启动 (每 {SCHEDULE_HOURS} 小时运行一次)")

    def _shutdown_scheduler():
        print(" [Scheduler] 关闭调度器...")
        scheduler.shutdown()

    atexit.register(_shutdown_scheduler)
    print("="*50 + "\n")

    # use_reloader=False: the reloader would start a second process and with it
    # a second scheduler.
    app.run(host='0.0.0.0', port=5000, debug=DEBUG, use_reloader=False)