Trae Bot committed on
Commit · c481f8a
Parent(s): 3690b90
Upload Spider_XHS project
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- .gitignore +31 -0
- Dockerfile +30 -0
- PRD.md +169 -0
- PROJECT_STATUS.md +92 -0
- README.md +742 -0
- USAGE.md +299 -0
- apis/__init__.py +0 -0
- apis/xhs_creator_apis.py +428 -0
- apis/xhs_creator_login_apis.py +317 -0
- apis/xhs_pc_apis.py +1050 -0
- apis/xhs_pc_login_apis.py +208 -0
- apis/xhs_pugongying_apis.py +176 -0
- apis/xhs_qianfan_apis.py +155 -0
- cli.py +198 -0
- docker-compose.yml +14 -0
- engines/__init__.py +19 -0
- engines/agentic_crawler.py +78 -0
- engines/base.py +23 -0
- engines/mediacrawler.py +1081 -0
- engines/spider_xhs.py +304 -0
- extension/background.js +136 -0
- extension/content.js +35 -0
- extension/manifest.json +20 -0
- frontend/.gitignore +24 -0
- frontend/README.md +73 -0
- frontend/eslint.config.js +23 -0
- frontend/index.html +13 -0
- frontend/package-lock.json +0 -0
- frontend/package.json +34 -0
- frontend/public/favicon.svg +1 -0
- frontend/public/icons.svg +24 -0
- frontend/src/App.css +184 -0
- frontend/src/App.tsx +42 -0
- frontend/src/assets/hero.png +0 -0
- frontend/src/assets/react.svg +1 -0
- frontend/src/assets/vite.svg +1 -0
- frontend/src/components/JsonViewer.tsx +93 -0
- frontend/src/index.css +9 -0
- frontend/src/layouts/AppLayout.tsx +171 -0
- frontend/src/main.tsx +14 -0
- frontend/src/pages/AIGenerationPage.tsx +77 -0
- frontend/src/pages/CleanedNotesPage.tsx +253 -0
- frontend/src/pages/ComplianceReviewPage.tsx +98 -0
- frontend/src/pages/DashboardPage.tsx +232 -0
- frontend/src/pages/ErrorsPage.tsx +276 -0
- frontend/src/pages/LeadsPage.tsx +78 -0
- frontend/src/pages/MetricsPage.tsx +81 -0
- frontend/src/pages/RawNotesPage.tsx +187 -0
- frontend/src/pages/ResourcesAccountsPage.tsx +178 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.js filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,31 @@
__pycache__
node_modules/
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
MANIFEST
*.manifest
*.spec
.cache
*.log
local_settings.py
db.sqlite3
__pypackages__/
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
Dockerfile
ADDED
@@ -0,0 +1,30 @@
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    ca-certificates \
    gnupg \
    && rm -rf /var/lib/apt/lists/*

# Node.js is needed to run the reverse-engineered signature JS (static/*.js)
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# Chromium powers the browser fallback engine (MediaCrawler)
RUN pip install --no-cache-dir -r requirements.txt \
    && playwright install chromium \
    && playwright install-deps chromium

COPY package.json .
RUN npm install

COPY . .

ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# 7860 is the conventional Hugging Face Spaces port
EXPOSE 7860

CMD ["uvicorn", "service.app:app", "--host", "0.0.0.0", "--port", "7860"]
PRD.md
ADDED
@@ -0,0 +1,169 @@
# Xiaohongshu Stable Scraping Microservice (Spider_XHS) - Product Requirements Document (PRD)

## 1. Product Overview

### 1.1 Product Name
Xiaohongshu Stable Scraping Microservice (Spider_XHS Stability Data Service)

### 1.2 Positioning
A Xiaohongshu data-scraping infrastructure serving upstream workloads such as enterprise LLMs (AI Agents), data analytics, and content operations, offering **high availability, risk-control resilience, and end-to-end traceability**. It packages the low-level anti-crawling countermeasures and data cleaning behind a standard RESTful API, so upstream services can fetch data reliably without dealing with accounts, risk control, or protocol internals.

### 1.3 Background and Pain Points
- **Harsh risk control**: Xiaohongshu applies aggressive anti-crawling measures to protocol-level API scraping (account bans, IP throttling, slider CAPTCHAs). A single protocol crack (e.g., reversing the `x-s` signature) tends to collapse entirely once strong risk control kicks in.
- **Efficiency vs. stability**: traditional browser automation (Playwright/Selenium) imitates humans well enough to bypass risk control, but its low concurrency and heavy CPU/memory footprint make it unsuitable for large batch runs.
- **The business must not stall**: for upstream content-generation and monitoring workloads, a data outage means the business stops; a fully available fallback mechanism is mandatory.

### 1.4 Core Solution
**"Stability Controller (scheduling brain) + three-engine automatic degradation + RPA backfill fallback + AI-Agent customer-acquisition loop"**:
1. **Primary path (three-engine automatic degradation)**:
   - Engine A (Spider_XHS): reverse-engineered protocol scraping (high concurrency, low cost)
   - Engine B (MediaCrawler): Playwright Stealth page scraping with human-like behavior
   - Engine D (AgenticCrawler): vision-LLM-based adaptive page parsing, resilient to front-end DOM redesigns
2. **Unified scheduling and resource pools**: an upfront Stability Controller applies per-error policies (`auth/rate/risk/captcha/timeout`, etc.) to allocate resources, retry, and make engine-degradation decisions across the Account Pool (cooldowns), Session Pool (rotation), and Proxy Pool (dynamic scoring and eviction). On complex CAPTCHAs it proactively wakes the **AI Agentic Captcha Solver** to clear risk control.
3. **Fallback path (Chrome extension RPA backfill)**: when every automatic engine and the AI CAPTCHA solver fail and a task enters `WAITING_RPA`, an operator scrapes in a real browser and posts the result back via `POST /api/v1/import/extension`.
4. **Offline import path (manual)**: formatted import of Excel exports produced by operators (e.g., Xiaohongshu Data Assistant).
5. **AI orchestration scripts (Orchestrator)**: an SQLite-backed business loop covering data cleaning, AI-generated image-and-text content, **fully automated real publishing** via an AI Agent (`browser-use`), and **automatic DM outreach to high-intent leads**.
6. **Ops Console**: a React + Ant Design dashboard for resource-pool (account/session/proxy) monitoring, aggregated error analysis, and a read-only content library.

---

## 2. Core Architecture

### 2.1 System Architecture Overview
```text
Upstream (Java/Agent) ──► [ OpenAPI / Webhook ] ──► (Spider_XHS FastAPI microservice)
                                                              │
         ┌────────────────────────────────────────────────────┤
         ▼                                                    ▼
[ Stability brain: Stability Controller ]      [ Fallback path: data import (Importer) ]
  ├─► Resources: Account Pool + Session Pool     ├─► Parser: xhs_data_assistant
  │      + Proxy Pool                            ├─► Parser: fixed_template
  ├─► AI Agent risk clearing (Captcha Solver)    │
  ▼                                              └─► Field normalization, metadata injection
[ Multi-engine execution path (Runner) ]                      │
  ├─► Engine A (Spider_XHS): reversed protocol, high concurrency
  │     └─ on timeout/rate/auth/risk/captcha ── retry/degrade ─┐
  │                                                            ▼
  ├─► Engine B (MediaCrawler): Stealth + human-like behavior
  │     └─ DOM parsing breaks / page redesign ── degrade ─┐
  │                                                       ▼
  └─► Engine D (Agentic Crawler): AI-vision page parsing ─────┤
        └─ still blocked by CAPTCHA → state WAITING_RPA (awaiting manual backfill)
         ┌────────────────────────────────────────────────────┘
         ▼
[ Persistence & monitoring ]
  ├─► File storage (JSON + HTML snapshots) with fcntl process locks
  ├─► Rate limiting (per-IP sliding window)
  └─► Prometheus metrics (/metrics)
         │
         ▼
[ AI business orchestration (Orchestrator) ]
  ├─► Core database: SQLite (14 business tables: keywords, drafts, leads, ...)
  ├─► AI Agent automation: publish notes (publish_tracker.py)
  ├─► AI Agent automation: lead DM outreach (lead_service.py)
  └─► Ecosystem: data cleaning, AI content generation, Feishu sync

[ Ops Console ]
  ├─► Resource-pool center (account/session/proxy health snapshots)
  ├─► Error center (failed-task aggregation and anomaly scanning)
  └─► Content library (read-only view over the SQLite data)

(Third fallback path: Chrome extension RPA backfill)
Browser extension ──► POST /api/v1/import/extension ──► writes the task result
```

---

## 3. Functional Requirements

### 3.1 Core Scraping (Data Scraping)
- **Note detail scraping (`note_url`)**: given a note link, fetch watermark-free images, video info, body text, and engagement data (likes/collects/comments).
- **User profile scraping (`user_profile`)**: given a user ID or profile link, fetch the creator's basic info, follower/following counts, and total likes and collects.
- **Keyword search scraping (`search`)**: given a keyword, fetch matching notes, sortable by relevance/latest/hottest.

### 3.2 Engine Scheduling and Stability Control (Stability Controller)
- **Pooled resources**: a built-in Account Pool, Session Pool, and Proxy Pool; every scraping task obtains the healthiest resources dynamically through the Stability Controller.
- **Proxy scoring and eviction**: the proxy pool aggregates multiple providers, writes scores back from task outcomes in real time, and automatically demotes and evicts high-failure proxies.
- **Account ban-prevention cooldowns**: accounts that repeatedly hit `rate` or `risk` errors enter a cooldown window to avoid platform blacklisting.
- **Three-engine failover and degradation (executed automatically server-side)**:
  - Tasks default to the `auto` strategy and are first assigned to **Engine A (Spider_XHS)** for maximum throughput.
  - Execution errors are handled per a policy table: `timeout` → retry with a new proxy; `rate` → cool the account down, switch accounts, lower frequency; `auth` → mark the session invalid and rotate sessions.
  - On strong risk control (`risk`/`captcha`), degrade to **Engine B (MediaCrawler)**.
  - Engine B ships Stealth scripts and randomized human-like behavior. If Engine B reports a `parse` error (the DOM structure changed), the task automatically flows to **Engine D (Agentic Crawler)**, where a vision LLM takes over.
- **Intelligent CAPTCHA solving (Captcha Solver)**: on complex slider or click CAPTCHAs, the system instantiates the LLM-based `AgenticCaptchaSolver` to drive the browser through verification. If the agent still fails, the task enters `WAITING_RPA`.
- **Last-resort path (manual RPA backfill)**: tasks in `WAITING_RPA` can be completed by an operator through the Chrome extension, which captures and posts the data back; this is a manual fallback channel, not a server-side engine thread.

### 3.3 Async Tasks and State Transitions (Task Lifecycle)
- **Extended task state machine**: fine-grained states such as `queued` -> `running` -> `retrying` -> `fallback_running` -> `succeeded` / `failed` / `waiting_rpa` / `rpa_imported` / `risk_paused`.
- **Async polling**: upstream services create a task, receive a `task_id`, and fetch results via long polling.
- **Webhook callbacks**: a global `CALLBACK_URL` can be configured; a callback fires automatically once a task reaches a terminal state.
- **Callback retries**: on upstream network flaps, the service retries with exponential backoff up to 5 times, carrying an `Idempotency-Key` header so the upstream can consume idempotently (a minimal sketch follows).
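The retry contract above can be pictured with a short sketch. This is a minimal illustration, not the service's actual implementation: the function name, the `requests` dependency, and the exact backoff schedule are assumptions; the 5-attempt cap, exponential backoff, and `Idempotency-Key` header come from the contract above.

```python
# Minimal sketch of the callback-retry policy (illustrative names, not the
# service's real internals).
import hashlib
import json
import time

import requests  # assumed HTTP client


def push_callback(callback_url: str, task_id: str, result: dict, max_attempts: int = 5) -> bool:
    body = {"task_id": task_id, "result": result}
    # A key derived from the payload stays identical across retries,
    # so the upstream can deduplicate deliveries.
    idem_key = hashlib.sha256(json.dumps(body, sort_keys=True).encode()).hexdigest()
    for attempt in range(max_attempts):
        try:
            resp = requests.post(
                callback_url,
                json=body,
                headers={"Idempotency-Key": idem_key},
                timeout=10,
            )
            if resp.ok:
                return True
        except requests.RequestException:
            pass  # network flap: fall through to backoff
        time.sleep(2 ** attempt)  # 1s, 2s, 4s, 8s, ...
    return False  # caller persists the failed delivery for later manual retry
```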
### 3.4 Offline Manual Fallback (Offline Import)
- **Excel parsing**: `/api/v1/import/excel` accepts Excel uploads.
- **Template auto-detection**: recognizes specific report formats such as "Xiaohongshu Data Assistant".
- **Normalization**: extracts high-level traffic metrics (exposure, reads, engagement, follower conversion) and aligns them with the scraped-data schema (the `normalized` contract).
- **Full traceability**: retains `operator`, `source_name`, row number, and sheet name metadata for auditing.

### 3.5 Ops Console and Visualization (Ops Console)
- **Resource-pool center**: reuses in-memory pool (Account/Session/Proxy Pool) snapshots for read-only lists, so operators can watch risk-control status and resource health in real time.
- **Aggregated error analysis**: scans recent tasks (e.g., the latest 1000) on the local task file system, aggregates by error category (auth/rate/risk/captcha, etc.), and supports filtered failure lists.
- **Content library pages**: backed by the Orchestrator's SQLite store, offering paginated queries and fuzzy search over raw and normalized notes.

---

## 4. Non-Functional Requirements

### 4.1 Concurrency and Data Consistency
- **Stateless service design**: service processes should stay as stateless as possible; persistence relies on the local file system (`./storage`).
- **Concurrency locking**: in single-machine multi-process/multi-thread scenarios, task-state writes must take an inter-process exclusive lock via `fcntl.flock`, combined with `.tmp` files and atomic replacement (`os.replace`), eliminating corrupted or overwritten files under high concurrency (sketched below).
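A minimal sketch of this locking discipline, assuming an illustrative helper name and lock-file layout (the production implementation lives in `service/storage.py`):

```python
# Locked atomic write: take an exclusive inter-process lock, write to a temp
# file, fsync, then atomically swap it into place.
import fcntl
import json
import os


def write_task_state(path: str, state: dict) -> None:
    lock_path = path + ".lock"
    with open(lock_path, "w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)  # inter-process exclusive lock
        try:
            tmp_path = path + ".tmp"
            with open(tmp_path, "w", encoding="utf-8") as tmp:
                json.dump(state, tmp, ensure_ascii=False)
                tmp.flush()
                os.fsync(tmp.fileno())  # make sure bytes hit the disk
            os.replace(tmp_path, path)  # atomic: readers see old or new, never half-written
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)
```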
### 4.2 Rate Limiting and Avalanche Prevention
- **Per-IP sliding-window rate limiting**: a lightweight in-memory API limiter (e.g., `100 requests / 60 s`); exceeding the threshold immediately returns HTTP 429 with a `Retry-After` header, protecting the service from malicious or anomalous traffic (see the sketch below).
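For illustration, a minimal in-memory sliding-window limiter along these lines; the class name and data structures here are assumptions (the service's actual middleware is `PerIPRateLimitMiddleware`):

```python
# Per-IP sliding-window limiter sketch: 100 requests per rolling 60 s window.
import time
from collections import defaultdict, deque


class SlidingWindowLimiter:
    def __init__(self, limit: int = 100, window_s: float = 60.0):
        self.limit = limit
        self.window_s = window_s
        self.hits = defaultdict(deque)  # ip -> deque of request timestamps

    def allow(self, ip: str) -> bool:
        now = time.monotonic()
        window = self.hits[ip]
        # Drop timestamps that fell out of the window.
        while window and now - window[0] > self.window_s:
            window.popleft()
        if len(window) >= self.limit:
            return False  # caller responds 429 with a Retry-After header
        window.append(now)
        return True
```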
### 4.3 Monitoring and Observability
- **Prometheus integration**: expose a standard `/api/v1/metrics` endpoint with the following core metrics:
  - `spider_xhs_tasks_total{engine, status}`: per-engine task counts.
  - `spider_xhs_queue_length` and `spider_xhs_tasks_inflight`: current queued and in-flight counts.
  - `spider_xhs_recent_failure_rate`: real-time failure-rate alert metric over a sliding window.
  - Proxy-pool metrics: `spider_xhs_proxy_pool_size`, `spider_xhs_proxy_pool_avg_score`, and failure-reason distributions.
- **Structured logging**: `loguru` with daily rotation; basic redaction is applied to some error logs, but automatic redaction of every custom log field is not guaranteed; sensitive credentials should be injected only via environment variables and never printed.

### 4.4 Deployment and Compatibility
- **Containerization**: Dockerfile and Docker Compose for one-command deployment; the storage directory is mounted separately to keep data persistent.
- **Backward compatibility**: legacy unversioned APIs can be served through a compatibility layer by setting `ENABLE_LEGACY_ROUTES=1`.

---

## 5. API Contract

### 5.1 Global Contract
Every RESTful response uses a standard three-part envelope:
```json
{
  "code": 200,      // business status code; 200 = success, 100xx = specific business errors
  "msg": "success", // message
  "data": { ... }   // payload
}
```

### 5.2 Core Endpoints
| Endpoint | Method | Purpose | Key parameters / notes |
|---|---|---|---|
| `/api/v1/tasks` | POST | Create a scraping task | Required: `task_type`, `target`; optional: `engine`, `payload` |
| `/api/v1/tasks/{id}` | GET | Query task status | Returns the current state (`queued`, `running`, ...) |
| `/api/v1/tasks/{id}/result` | GET | Fetch the task result | 409 while unfinished; on completion returns `raw`, `normalized`, `meta` |
| `/api/v1/tasks/{id}/callback/retry` | POST | Manually retry the callback | Human intervention after webhook delivery has failed for good |
| `/api/v1/import/excel` | POST | Upload and parse Excel | Required: `file`, `operator`; outputs normalized, cleaned records |
| `/api/v1/import/extension` | POST | RPA backfill from the extension | Required: `task_id`, `raw`, `normalized`; sets the task to `RPA_IMPORTED` |
| `/api/v1/metrics` | GET | Prometheus metrics | For monitoring systems (e.g., Grafana); covers tasks, queue, proxy pool |
| `/api/v1/health` | GET | Health check and dashboard | Queue length, in-flight count, per-engine stats, live failure rate |

---

## 6. Roadmap

- **Phase 1 (V1.0) - shipped**: dual-engine dual-path architecture refactor; core scraping, failover degradation, IP rate limiting, concurrency control, and containerized monitoring delivered.
- **Phase 2 (MVP) - current**: lightweight Orchestrator scripts plus an SQLite core database, providing a minimal "crawl sync → content cleaning → mock Feishu sync/alerts" loop.
- **Phase 3 (Stability) - current**: Stability Controller refactor with full lifecycle management of the Account, Session, and Proxy Pools; Chrome-extension backfill channel (`/api/v1/import/extension` + the `WAITING_RPA` state machine), completing the controllable "dual-engine automatic scraping + manual RPA backfill" loop; React-based Ops Console with the resource-pool center, error center, and read-only content library pages.
- **Phase 4 (V1.1)**: a priority task queue so urgent jobs can jump the line; optional Redis for distributed storage and distributed locks, enabling multi-node horizontal scaling.
- **Phase 5 (V2.0)**: an LLM-powered extraction path for unknown-format or garbled Excel files; a visual hot-reload admin console for tuning the account and proxy pools.
PROJECT_STATUS.md
ADDED
@@ -0,0 +1,92 @@
# Spider_XHS (Xiaohongshu Stable Scraping Microservice) - Project Status

## 1. Overview

Spider_XHS began as a pure script library built on reverse-engineered signatures for the Xiaohongshu PC and creator platforms. After several rounds of major architectural refactoring, it has been rebuilt into an **enterprise-grade, highly available, risk-control-resilient** scraping microservice, combining a production-level "stability scheduling brain + pooled resources + dual automatic engines + RPA backfill fallback" strategy.

The project has met every core target defined in the V1.0.0 technical specification, completing the leap from "personal learning crawler" to "data substrate for AI Agents".

---

## 2. Core Architecture Status

### 2.1 Stability Controller & Multi-Path Fallback
**Fully implemented.**
- **Stability Controller**: degradation is no longer hard-coded in the crawlers; a unified brain decides, per fine-grained error (`timeout/rate/auth/risk/captcha`), whether to retry, cool an account down, invalidate a session, degrade engines, or pause the task.
- **Pooled resource substrate**:
  - `Account Pool`: per-account risk scores and automatic cooldown windows after repeated errors (`rate`/`risk`).
  - `Session Pool`: unified management of API cookie sessions and Playwright `storage_state`, with lightweight health checks and rotation.
  - `Proxy Pool`: pulls IPs dynamically from multiple external APIs or files, scores them from execution results, and evicts high-failure proxies automatically.
- **Multiple automatic engines + one manual RPA fallback path**:
  - **Engine A (API engine)**: the high-concurrency first choice.
  - **Engine B (Browser engine)**: enhanced Stealth fingerprint scripts and randomized human-like behavior; the second path when risk-control CAPTCHAs appear.
  - **Engine D (Agentic Crawler)**: a newly introduced engine built on `browser-use` and a vision LLM, targeting exactly the scenario where Xiaohongshu front-end DOM redesigns break traditional crawlers.
  - **Agentic Captcha Solver**: on complex CAPTCHAs, the system automatically wakes a vision-LLM instance to recognize and complete click/slider challenges.
  - **RPA backfill path (Chrome extension)**: a loadable Chrome extension plus the server endpoint `/api/v1/import/extension`. This is the **final fallback channel**, backfilling results for tasks stuck in `WAITING_RPA` and resuming the callback/persistence flow.

### 2.2 Manual Fallback Path (Offline Excel Import)
**Fully implemented.**
- For extreme cases (network-wide IP bans or major platform upgrades), the `/api/v1/import/excel` endpoint was built.
- Accepts offline Excel exports from "Xiaohongshu Data Assistant" or third-party platforms.
- The `xhs_data_assistant` parser recognizes header variants, maps fields automatically (extracting exposure, reads, engagement, follower conversion, and similar metrics), and reshapes them into the standard `normalized` structure, seamlessly joining online automation with offline manual data.

---

## 3. Server-Side Capability Status

### 3.1 Async Tasks and Webhook Callbacks
**Fully implemented.**
- A stateless, lightweight FastAPI microservice (`service/app.py`).
- Long polling via `/api/v1/tasks/{id}/result`.
- Richer fine-grained state transitions (`queued` -> `running` -> `retrying` / `fallback_running` -> `waiting_rpa` / `rpa_running` -> `succeeded/failed/rpa_imported`, etc.).
- Webhook push built on `Idempotency-Key`, with automatic exponential-backoff retries (up to 5) on callback failure, so upstreams (e.g., a Java backend) receive results reliably and idempotently.

### 3.2 High Concurrency and File-Locked Storage
**Fully implemented.**
- The persistence layer (`service/storage.py`) uses sharded JSON storage on the local file system (`./storage`).
- **Atomic writes + process locks**: writes go through a temp file plus `os.replace`; critical update paths take a lightweight `fcntl.flock` mutex to prevent concurrent multi-process overwrites, guaranteeing consistency and recoverability.

### 3.3 Enterprise Operations and Monitoring
**Fully implemented.**
- **Anti-avalanche IP rate limiting**: an in-memory sliding-window limiter at the API gateway layer (`PerIPRateLimitMiddleware`), default `100 requests / 60 s`, returning `429` with `Retry-After` above the threshold; the O(N) cleanup cost was also optimized to withstand malicious traffic.
- **Prometheus monitoring**: the standard `/api/v1/metrics` endpoint exposes per-engine `spider_xhs_tasks_total`, queue backlog and in-flight counts, `spider_xhs_recent_failure_rate`, plus proxy-pool size, scores, and failure reasons, ready for Grafana dashboards.
- **Structured logging with basic redaction**: `loguru` with daily rotation, retention, and compression; basic redaction on some error logs, without guaranteeing automatic redaction of every custom field; sensitive credentials should be injected only via environment variables and never printed.
- **Automatic cleanup**: a background daemon thread prunes expired (default 7 days) HTML snapshots produced by the browser engine.

### 3.4 Automated Customer Acquisition Closed-Loop (MVP)
**Fully implemented.**
- A new `orchestrator/` layer on top of the service closes the loop of "crawl sync → content cleaning → AI content generation → AI auto-publishing → AI outreach to high-intent users → Feishu lead sync".
- **Local database substrate**: 14 core business tables on SQLite (`orchestrator/data/mvp.db`), covering the full lifecycle from data acquisition and draft management to task retries and customer leads.
- **AI generation and automation**:
  - LLM-driven batch rewriting and original copywriting (`ai_generation.py`).
  - An AI Browser Agent on the `browser-use` framework achieving unattended **real note publishing on Xiaohongshu** (`publish_tracker.py`) and **automatic DM replies to high-intent leads** (`lead_service.py`).
- **Ecosystem**: `feishu_sync.py` and `alert.py` complete mock flows of lead data into the upstream CRM (Feishu Bitable) and alert groups.

### 3.5 Ops Console Visualization
**Fully implemented.**
- A lightweight front-end console (`frontend/`) on React + Vite + Ant Design.
- Read-only **resource-pool center** (account/session/proxy snapshots), **error center** (error aggregation and failed-task scans), and **content library pages** (raw and cleaned notes read from the Orchestrator SQLite database).
- By reusing in-memory snapshots of `AccountPool`, `SessionPool`, and `ProxyPool` plus the local task file list, the front end connects to back-end data with minimal intrusion, offering friendly dashboards and empty-state guidance.

---

## 4. Deployment and Compatibility Status

### 4.1 Containerization and Standalone Operation
**Fully implemented.**
- Complete `Dockerfile` and `docker-compose.yml`.
- One command brings up every component, including the FastAPI service and the local file mount (`./storage`).
- A `MEDIACRAWLER_MOCK` mode validates scheduling, failover, and parsing via mock HTML without Playwright dependencies (useful for self-tests and CI/CD).

### 4.2 Legacy Compatibility
**Fully implemented.**
- `ENABLE_LEGACY_ROUTES=1` transparently serves old routes without the `/api/v1` prefix.
- The storage layer detects a legacy single `tasks.json` at startup and hot-migrates it into the new sharded directory layout, a zero-friction upgrade for existing users.

---

## 5. Summary

**Spider_XHS has reached the productization goals set for the current version.** Beyond its hardcore reverse-engineered signature capabilities as a crawler framework, the FastAPI microservice architecture, Stability Controller, pooled resource substrate, dual-engine failover (API + Playwright Stealth) with RPA backfill, atomic file persistence with process locks, Prometheus monitoring, and anti-avalanche IP rate limiting together form a stable scraping substrate for upstream AI/data workloads.

The project also ships a **sample-grade orchestration MVP** (`orchestrator/` + SQLite) demonstrating crawl sync, basic cleaning, and mock Feishu sync/alert persistence, with clear extension points for future AI generation/compliance modules. Combined with the React-based **Ops Console**, it closes the visualization loop across resource-pool monitoring, error triage, and content display, greatly improving usability and operability.
README.md
CHANGED
@@ -8,3 +8,745 @@ pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
<div align="center">

# Spider_XHS

**Professional Xiaohongshu data scraping & full-scope operations solution & Agent Skills**

[](https://www.python.org/)
[](https://nodejs.org/)
[](LICENSE)

</div>

> **In the era of large AI models, content operations compete on efficiency.**
> This project packages Xiaohongshu's full data-scraping and content-publishing capabilities, giving developers a reliable, stable low-level API substrate for building AI operations agents.
> **The project has been upgraded to a "stability brain + three-engine automatic degradation + AI-Agent full-loop" architecture: the server automatically runs API protocol scraping + Playwright Stealth + a vision-LLM Agentic Crawler, and supports fully automated note publishing and DM lead outreach via AI Agents.**

**⚠️ This project is for learning and research only; any commercial use is prohibited, at your own risk.**

---

## Why this project?

```
Scrape competitor notes ──► [Spider_XHS] ──► your AI Agent (rewrite / generate / analyze) ──► auto-publish
        ▲                                                                                        │
        └──────────────────── fetch data / manage accounts ◄─────────────────────────────────────┘
```

Xiaohongshu does not expose a complete content-operations API. To plug in large models for batch scraping, smart rewriting, and one-click publishing, you first need **stable read/write access to platform data**. Spider_XHS solves exactly that prerequisite:

- Reverse-engineered the signature algorithms of the Xiaohongshu PC and creator platforms (the x-s / x-t / x-s-common / x_b3_traceid / sign / q-signature parameters)
- Wraps all core HTTP endpoints, with signature handling made transparent
- Covers all three scenarios: **data scraping** (PC), **content publishing** (creator platform), and **KOL data** (Pugongying)

**You plug in the AI brain; we wire up Xiaohongshu's nervous system.**

---

## ⭐ Implemented Features

| Module | Feature | Status |
|------|------|------|
| **Xiaohongshu PC** | QR-code login / SMS-code login | ✅ |
| | Home feed channels & recommended notes | ✅ |
| | User profile info / own account info | ✅ |
| | All notes a user posted / liked / collected | ✅ |
| | Full note content (watermark-free images & video) | ✅ |
| | Note search & user search | ✅ |
| | Note comments | ✅ |
| | Unread messages / comment @mentions / likes & collects / new followers | ✅ |
| **Creator platform** | QR-code login / SMS-code login | ✅ |
| | Upload image-gallery posts | ✅ |
| | Upload video posts | ✅ |
| | List published posts | ✅ |
| **Pugongying platform** | KOL creator lists & detailed data | ✅ |
| | Creator follower profiles & historical trends | ✅ |
| | Send collaboration invitations | ✅ |
| **Qianfan platform** | Distributor lists & detailed data | ✅ |
| | Distributor categories / shops / product info | ✅ |

---

## 🌟 Core Feature: Stability Brain and AI-Agent Multi-Path Failover

To counter Xiaohongshu's increasingly strict risk control (bans, CAPTCHAs, frequent front-end redesigns), the scraping architecture was rebuilt around **three automatic engines + RPA backfill fallback + a Stability Controller**, fit for production-grade high-availability workloads.

### 1. Stability Brain and Pooled Resources
Instead of hard-coding exception handling into crawler logic, a unified `Stability Controller` plus three resource pools manage the anti-ban lifecycle:
- **Account Pool**: manages per-account risk scores; heavy risk control triggers a ban-prevention cooldown window.
- **Session Pool**: handles both API login state and browser storage state, with invalidation detection and automatic rotation.
- **Proxy Pool**: pulls IPs from multiple sources, scores them on network outcomes, and evicts high-failure proxies automatically.
- **Agentic Captcha Solver**: on complex CAPTCHAs, automatically wakes a `browser-use` + vision-LLM agent to complete slider drags and click challenges.

### 2. Automatic Primary Scraping Path (Three-Engine Failover)
A standard lightweight FastAPI service runs three engines automatically:
- **Engine A (API protocol engine)**: hits endpoints directly via reverse-engineered signatures. Fast, highly concurrent, minimal footprint; the absolute workhorse.
- **Engine B (Browser engine)**: Playwright automation with Stealth script injection and randomized human-like actions (mouse paths, scrolling; toggleable) to reduce automation fingerprints.
- **Engine D (Agentic Crawler)**: a **vision-LLM adaptive page extractor**. When a front-end DOM redesign breaks Engine B's fixed parsing rules, the system degrades to Engine D, where the model "reads" the page itself and extracts the title, author, body, and other structured data, regardless of redesigns.

### 3. Manual Fallback Path (Chrome Extension RPA Backfill)
The Chrome extension is not an automatically executed "third engine thread" but the **last manual fallback channel**: when no automatic engine or AI agent gets past risk control and the task enters `WAITING_RPA`, an operator scrapes the page content/data in a real browser and posts the result back through the extension's server endpoint to close the loop.

### 4. Manual Fallback Scraping Path (Excel Import)
In extreme cases (network-wide IP bans or sweeping upgrades), Excel exports from **Xiaohongshu Data Assistant** or third-party platforms can be parsed in bulk:
- `/api/v1/import/excel` auto-detects header variants (exposure, reads, engagement, follower conversion, etc.).
- Fields are mapped and normalized automatically, splicing offline manual data into the automated flow so the business never stalls.

### 5. Enterprise Operations Safeguards
- **Anti-avalanche rate limiting**: built-in per-IP sliding-window rate limiting; exceeding the threshold returns `429` with `Retry-After`.
- **Prometheus monitoring**: `/api/v1/metrics` exposes core metrics such as `spider_xhs_tasks_total` and `spider_xhs_recent_failure_rate`.
- **Concurrency and persistence**: sharded file storage guarded by `fcntl.flock`, keeping cross-process writes strictly atomic.
- **Callback retries**: webhook result push with `Idempotency-Key` idempotency control and exponential-backoff retries.

### 6. AI Customer-Acquisition Orchestration (Orchestrator MVP)
On top of the scraping service, lightweight Python orchestration scripts plus a local SQLite database cover the acquisition flow end to end:
- **Crawl sync and cleaning**: `crawl_sync.py` & `note_cleaner.py`
- **Batch AI content generation**: `ai_generation.py`
- **Fully automated note publishing**: `publish_tracker.py` (the LLM drives the creator console via `browser-use`)
- **Fully automated lead outreach**: `lead_service.py` (the AI opens Xiaohongshu web DMs and sends lead-capture scripts to high-intent users)
- **Feishu sync and alerts**: `feishu_sync.py` & `alert.py` (mock flows)

### 7. Ops Console Visualization
A React + Vite dashboard (`frontend/`) monitors resource-pool (account/session/proxy) health, aggregates and scans task errors, and offers read-only paginated search over the underlying SQLite content library, making the front-to-back data path tangible.

---

## 🤖 Plugging into AI Agents

Spider_XHS is a natural data substrate for AI operations agents. Typical usages:

### Scenario 1: Scrape competitor notes + AI rewrite + auto-publish

```python
from apis.xhs_pc_apis import XHS_Apis
from apis.xhs_creator_apis import XHS_Creator_Apis

pc_api = XHS_Apis()
creator_api = XHS_Creator_Apis()

# 1. Scrape a competitor note
success, msg, note = pc_api.get_note_info(note_url, cookies_str)

# 2. Hand it to your AI for rewriting (any LLM)
rewritten = your_ai_agent(note['content'])  # GPT / Claude / Qwen / local model

# 3. Auto-upload to the creator platform
creator_api.post_note({
    "title": rewritten['title'],
    "desc": rewritten['desc'],
    "media_type": "image",
    "images": [...],
    ...
}, creator_cookies_str)
```

### Scenario 2: Keyword monitoring + AI intelligence analysis

```python
# Search the latest notes for a keyword and let the AI analyze trends
success, msg, notes = pc_api.search_some_note(query, require_num, cookies_str, ...)
analysis = your_ai_agent(notes)
```

### Scenario 3: KOL screening + smart matching

```python
from apis.xhs_pugongying_apis import PuGongYingAPI

pgy = PuGongYingAPI()
# Fetch KOL data for a target category and let the AI score the fit
kol_list = pgy.get_some_user(num=50, cookies=cookies)
best_kols = your_ai_agent(kol_list, brand_profile)
```

---

## 🧩 Skills Support

The project supports skills-based integration: use it directly as the underlying capability repo for `Spider_XHS`, or pull it into upper-layer agent toolchains as standardized skills.

To reuse the pre-packaged skills directly, use the built-in Agent Skills module; it can be imported and integrated by skills-aware tools such as `Clawbot`, `Claude Code`, and `Codex`.

---

## 🎨 Crawler Output Screenshots

### All processed users


### All notes of one user


### Full content of one note


### Saved Excel


---

## 🛠️ Quick Start

### ⛳ Requirements

- Python 3.10+
- Node.js 20+

### 🎯 Install dependencies

```bash
pip install -r requirements.txt
npm install
```

### 🧩 Playwright (required for the Browser fallback engine and QR-code login)

To use the **browser engine (MediaCrawler) as a fallback**, or the "built-in browser QR-code login that fetches cookies and writes them back to `.env`" feature, install the Playwright browser binaries:

```bash
playwright install chromium
```

### 🎨 Configure cookies

Put your logged-in cookie into the `.env` file at the project root:

```
COOKIES='<YOUR_XHS_COOKIE>'
```

How to get the cookie: log in to Xiaohongshu in a browser, press `F12` → Network → Fetch/XHR → pick any request → copy the `cookie` field from the request headers.




> **Note: the cookie must come from a logged-in session; logged-out cookies do not work.**

### 🚀 Run

```bash
python main.py
```

### 🧰 CLI (recommended)

The CLI supports team batch runs and resumable checkpoints (default state file `datas/state.json`):

```bash
python cli.py --help
```

#### QR-code login with automatic cookie write-back (recommended)

```bash
python cli.py login pc-qrcode --save-cookies --write-env
```

On success it writes both:

- `datas/cookies.json`
- `COOKIES="..."` in `.env`

#### Scraping examples

```bash
python cli.py note --url 'https://www.xiaohongshu.com/explore/xxxx?...' --save-choice excel --excel-name demo --resume
python cli.py user --url 'https://www.xiaohongshu.com/user/profile/xxxx?...' --resume
python cli.py search --query durian --num 50 --resume
```

With a proxy:

```bash
python cli.py search --query durian --num 50 --proxy http://127.0.0.1:7890
```

---

## 🧪 FastAPI Service (for Java / task systems)
- Use case: run Spider_XHS as a "scraping/parsing microservice"; upstream systems (Java/Agent/workflow engines) submit tasks and pull results, optionally with callback push.
- Note: the server never requires cookies in code; inject them via environment variables or the task payload, and never log them.

### 🚀 Start the service (uvicorn)
From the project root:

```bash
python -m uvicorn Spider_XHS.service.app:app --host 0.0.0.0 --port 8000
```

Development with hot reload:

```bash
python -m uvicorn Spider_XHS.service.app:app --host 0.0.0.0 --port 8000 --reload
```

Once started:
- OpenAPI JSON: `http://localhost:8000/openapi.json`
- Swagger UI: `http://localhost:8000/docs`

### 🖥️ Ops Console front end (frontend/)
`frontend/` is a Vite + React + TypeScript ops console covering health status, the task center, RPA backfill, the error center, the resource-pool center, the content library, and monitoring metrics.

Routes:
- `/dashboard`: service health overview
- `/tasks`, `/tasks/:id`: task center and task details
- `/rpa`: RPA backfill (Chrome extension)
- `/errors`: error center
- `/resources/accounts`, `/resources/sessions`, `/resources/proxies`: resource-pool center (accounts/sessions/proxies)
- `/content/raw-notes`, `/content/cleaned-notes`: content library (raw/cleaned notes)
- `/metrics`: monitoring metrics

#### 1) Install dependencies
```bash
cd frontend
npm install
```

#### 2) Configure the backend URL (VITE_API_BASE_URL)
The default backend is `http://localhost:8000/api/v1`. Override via environment variable if needed:

- Recommended for local development (through the Vite dev proxy, avoiding CORS):
```bash
export VITE_API_BASE_URL=/api/v1
```

- Or a full URL (cross-origin; the backend must allow CORS):
```bash
export VITE_API_BASE_URL=http://localhost:8000/api/v1
```

#### 3) Start the front end (dev mode)
```bash
npm run dev
```

URL: `http://localhost:5173/`

#### 4) Run together with FastAPI (local integration)
Terminal A (backend):
```bash
python -m uvicorn Spider_XHS.service.app:app --host 0.0.0.0 --port 8000 --reload
```

Terminal B (front end):
```bash
cd frontend
export VITE_API_BASE_URL=/api/v1
npm run dev
```

### 🧩 API Overview (/api/v1)
By default only the versioned routes are exposed (recommended):
- Base URL: `http://<host>:8000/api/v1`
- Unified response envelope: `{ "code": 200, "msg": "success", "data": ... }`
- Legacy routes (without `/api/v1`) can be enabled via `ENABLE_LEGACY_ROUTES=1`; off by default

#### Unified response format
Every endpoint (including errors) returns this structure:

```json
{
  "code": 200,
  "msg": "success",
  "data": {}
}
```

Common error codes (excerpt):
| Scenario | HTTP | code | Notes |
|------|------|------|------|
| Auth / login state expired | 401 | 10001 | auth required |
| Per-IP rate limited | 429 | 10002 | rate limited |
| Risk control / CAPTCHA | 403 | 10003 | risk control |
| Invalid parameter/path | 400/404 | 10007 | invalid target |
| Task not found | 404 | 10008 | task not found |
| Result not ready | 409 | 10009 | task not ready |
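Since every endpoint shares this envelope, upstream clients can centralize the unwrap-and-raise logic. A minimal client-side sketch (the helper and exception names are illustrative, not part of the service):

```python
# Unwrap the {code, msg, data} envelope and raise on business errors.
import requests


class SpiderXhsError(RuntimeError):
    def __init__(self, code: int, msg: str):
        super().__init__(f"code={code} msg={msg}")
        self.code = code


def get_data(url: str) -> dict:
    resp = requests.get(url, timeout=10)
    envelope = resp.json()  # every endpoint, including errors, uses the envelope
    if envelope.get("code") != 200:
        raise SpiderXhsError(envelope.get("code", -1), envelope.get("msg", ""))
    return envelope.get("data", {})
```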
#### Endpoints
| Method | Path | Description |
|------|------|------|
| GET | `/tasks` | Paginated task list (filter/sort) |
| POST | `/tasks` | Create a task |
| GET | `/tasks/{task_id}` | Query task status |
| GET | `/tasks/{task_id}/result` | Fetch the task result |
| POST | `/tasks/{task_id}/callback/retry` | Manually retry the callback |
| POST | `/import/excel` | Excel import |
| POST | `/import/extension` | Chrome extension backfill |
| GET | `/errors/summary` | Aggregated error stats and failed-task list |
| GET | `/resources/accounts` | Account-pool snapshot (read-only) |
| GET | `/resources/sessions` | Session-pool light checks (read-only) |
| GET | `/resources/proxies` | Proxy-pool snapshot (read-only) |
| GET | `/content/raw-notes` | Content library raw_note list (SQLite, read-only) |
| GET | `/content/cleaned-notes` | Content library cleaned_note list (SQLite, read-only) |
| GET | `/health` | Health check |
| GET | `/metrics` | Prometheus metrics |

#### Ops: GET /api/v1/health
Returned fields (excerpt):
- `queue_length`: number of currently `queued` tasks
- `engine_usage_counts`: cumulative tasks handled per engine (in-memory counter, resets on process restart)
- `success_count` / `fail_count`: cumulative succeeded/failed tasks (in-memory counters, reset on restart)
- `recent_failure_rate`: recent failure rate (sliding window)
- `recent_failure_window_seconds` / `recent_failure_total` / `recent_failure_failed`: the window plus numerator and denominator

#### Monitoring: GET /api/v1/metrics
Prometheus text format (`text/plain; version=0.0.4`); core metrics (excerpt):
- `spider_xhs_queue_length`
- `spider_xhs_tasks_total{engine,status}`
- `spider_xhs_tasks_succeeded_total` / `spider_xhs_tasks_failed_total`
- `spider_xhs_recent_failure_rate`

#### Error center: GET /api/v1/errors/summary
Purpose: aggregate the `error_kind` distribution over the most recent N tasks and return a failed-task list (paginated/filterable).

Query parameters (excerpt):
- `scan_limit`: how many recent tasks to scan (defaults to `ERROR_SUMMARY_SCAN_LIMIT`, or 1000 if unset)
- `status`: comma-separated filter (e.g., `failed,waiting_rpa`)
- `error_kind`: comma-separated filter (e.g., `auth,rate`)

#### Resource-pool center: GET /api/v1/resources/*
- `/resources/accounts`: account-pool snapshot (risk_score/cooldown, etc.)
- `/resources/sessions`: session-pool light check results (ok/reason)
- `/resources/proxies`: proxy-pool snapshot (size / average score / failure-reason distribution, etc.)

#### Content library: GET /api/v1/content/*
The content library reads from the orchestration layer's SQLite database; its path is set by `ORCHESTRATOR_DB_PATH` (default `orchestrator/data/mvp.db`).

- `/content/raw-notes`: raw_note list (fuzzy search on author/url/content)
- `/content/cleaned-notes`: cleaned_note list (cleaned_content plus linked raw_note info)

#### Log fields (structured)
Service logs carry structured fields in `extra` (example fields):
- `task_id` / `task_type` / `engine` / `status`
Never write cookies, callback signatures, Authorization headers, or other secrets into logs; error messages redact URL tokens/cookies and the like.

#### 1) Create a task: POST /api/v1/tasks
Purpose: submit a scraping/parsing task; returns a task_id immediately and executes in the background.

Request body (JSON):
- `task_type`: `note_url` / `search` / `user_profile`
- `target`: the target (URL/keyword/user ID); recommended, and automatically copied into the payload (backward compatible)
- `payload`: task parameters (see the examples below)
- `engine`: optional, `auto` / `api` / `browser` (falls back to the `ENGINE_STRATEGY` environment variable)

Example (scrape note details):

```bash
curl -X POST "http://localhost:8000/api/v1/tasks" \
  -H "Content-Type: application/json" \
  -d '{
    "task_type": "note_url",
    "target": "https://www.xiaohongshu.com/explore/xxxx?xsec_token=yyyy",
    "engine": "auto",
    "payload": {
      "operator": "demo"
    }
  }'
```

Notes:
- API-engine (Spider_XHS) cookie precedence: `payload.cookies`/`payload.cookie`/`payload.cookies_str` > the `COOKIES`/`XHS_COOKIES` environment variables
- Proxy precedence: `payload.proxies` (object) > `payload.proxy`/`payload.proxies` (string) > the `SERVICE_PROXY` environment variable; the browser engine can additionally use `SERVICE_PROXY_POOL` for simple rotation

Payload conventions per task_type:
- `note_url`
  - `note_url`: the note link (required)
  - `cookies`: optional (prefer injecting via environment variables so they never leak into upstream logs)
- `search`
  - `query` or `keyword`: the keyword (required)
  - `require_num` or `limit`: expected count (default 20)
  - `sort_type_choice` / `note_type` / `note_time` / `note_range` / `pos_distance` / `geo`: optional (aligned with the CLI/existing APIs)
- `user_profile`
  - `user_id` or `uid`: the user id (one of them required)
  - or `user_url`/`url`: the profile link (user_id can be parsed from the URL)

#### 2) Query task status: GET /api/v1/tasks/{task_id}

```bash
curl "http://localhost:8000/api/v1/tasks/<task_id>"
```

Returned fields (excerpt):
- `task.status`: `queued` / `running` / `succeeded` / `failed`
- `task.engine`: the engine actually used (e.g., `spider_xhs` / `mediacrawler`)
- `task.callback`: callback status (present only when a callback is configured)

#### 3) Fetch the result: GET /api/v1/tasks/{task_id}/result
- While the task is still `queued/running`, returns `409` (result not ready).
- After the task finishes, returns:
  - `raw`: the engine's raw output (structure may change with upstream APIs)
  - `normalized`: normalized key fields (ready for storage/search)
  - `meta`: task metadata and error classification

```bash
curl "http://localhost:8000/api/v1/tasks/<task_id>/result"
```

Common `meta` fields:
- `ok`: success flag
- `error_kind`: failure category (e.g., `auth` / `rate` / `risk` / `parse` / `timeout` / `missing_dependency`)
- `primary_engine` / `final_engine`: the primary and final engines under the auto strategy
- `fallback_reason`: why the fallback fired (threshold reached, auth/risk failure, etc.)
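A minimal polling sketch tying these pieces together: back off while the service returns 409 (result not ready), then inspect `meta.error_kind`. The function name, base URL, and backoff schedule are illustrative; the 409 semantics and `meta` fields come from the contract above.

```python
# Poll GET /tasks/{task_id}/result with capped exponential backoff.
import time

import requests

BASE = "http://localhost:8000/api/v1"


def wait_for_result(task_id: str, max_wait_s: float = 120.0) -> dict:
    delay, waited = 0.5, 0.0
    while waited < max_wait_s:
        resp = requests.get(f"{BASE}/tasks/{task_id}/result", timeout=15)
        if resp.status_code != 409:  # 409 => still queued/running
            envelope = resp.json()
            result = envelope.get("data", {})
            meta = result.get("meta", {})
            if not meta.get("ok") and meta.get("error_kind") in ("risk", "auth"):
                # Usually needs a human: solve the CAPTCHA or refresh cookies.
                raise RuntimeError(f"manual intervention needed: {meta.get('error_kind')}")
            return result  # contains raw / normalized / meta
        time.sleep(delay)
        waited += delay
        delay = min(delay * 2, 8.0)  # 0.5s, 1s, 2s, ... capped
    raise TimeoutError(f"task {task_id} not ready after {max_wait_s}s")
```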
#### 4) Manually retry a failed callback: POST /api/v1/tasks/{task_id}/callback/retry
Available only when a callback URL is configured and the result already exists. The server:
- applies idempotency via the `Idempotency-Key` request header
- tries at most 5 times (exponential backoff + jitter)
- persists the failed request to disk for later retries if it still fails

```bash
curl -X POST "http://localhost:8000/api/v1/tasks/<task_id>/callback/retry"
```

#### 5) Excel import (manual fallback): POST /api/v1/import/excel
Purpose: when online scraping suffers from risk control/CAPTCHAs, import and normalize manually exported Excel files.

Request (multipart/form-data):
- `file`: the Excel file
- `operator`: operator/source identifier (required)
- `source_name`: optional source label (e.g., "export by ops member A")

```bash
curl -X POST "http://localhost:8000/api/v1/import/excel" \
  -F "file=@/path/to/demo.xlsx" \
  -F "operator=ops" \
  -F "source_name=manual_export"
```

Template detection (automatic):
- `xhs_data_assistant`: matched when Data Assistant metric columns (exposure/reads/views/engagement/follower conversion) appear together with identifying columns such as note title / note link / author nickname
- `fixed_template`: the fallback template (any headers); tries to extract URLs/keywords from row contents

Common columns supported by `xhs_data_assistant` (Chinese or English; case- and whitespace-insensitive):
- Identifiers: note link / note url / post link → `note_url`; note title → `title`; author nickname → `author`
- Time: publish time / upload time → `publish_time`
- Engagement: likes → `like`, comments → `comment`, collects → `collect`, shares/reposts → `share`
- Traffic: exposure → `exposure`, reads → `read`, views → `view`, engagement → `interact`, follower conversion → `follow`
- Content type: content type / note type → `content_type`

Normalized output (`records[].normalized`):
- Base: `kind` (note/user/search/url), `note_url`/`note_id`/`user_url`/`user_id`/`query`
- Extras: `title`, `author`/`nickname`, `publish_time`, `like_count`, `comment_count`, `collect_count`, `share_count`, `exposure`, `read`, `view`, `interact`, `follow`, `content_type`

Provenance (`records[].meta`):
- `import_id`, `operator`, `source_name`, `row_number`, `dedup_key`, `parser`, `filename`, `sheet`

### ⚙️ Environment Variables (inject only at deployment; never commit to code or the repo)
> The table lists names and purposes only; never output real cookies, callback signatures, or account info in logs/errors.

| Variable | Default | Description |
|------|--------|------|
| `ENABLE_LEGACY_ROUTES` | `0` | Serve legacy routes (without the `/api/v1` prefix); keep off in production |
| `STORAGE_ROOT` / `SERVICE_STORAGE_ROOT` / `SERVICE_STORAGE_DIR` | `./storage` | Root dir for service persistence (tasks/results/failed callbacks/raw snapshots/logs) |
| `SERVICE_CONCURRENCY` | `4` | Task-execution concurrency (tune together with upstream concurrency) |
| `SERVICE_PROXY` | empty | Proxy (affects both the API and browser engines) |
| `SERVICE_PROXY_POOL` | empty | Proxy pool (comma-separated; the browser engine rotates per retry; merged and deduplicated with `SERVICE_PROXY`) |
| `ENGINE_STRATEGY` | `auto` | `auto`/`api`/`browser` |
| `ENGINE_FALLBACK_THRESHOLD` | `3` | Failure threshold before `auto` switches to the browser engine |
| `RAW_DATA_RETENTION_DAYS` | `7` | Days to keep raw HTML snapshots (cleaned at startup + periodically) |
| `ORCHESTRATOR_DB_PATH` | `orchestrator/data/mvp.db` | Orchestrator SQLite path (read-only source of the `/content/*` endpoints) |
| `ERROR_SUMMARY_SCAN_LIMIT` | `1000` | Default number of recent tasks scanned by the error center (the default `scan_limit` of `/errors/summary`) |
| `CALLBACK_URL` / `SERVICE_CALLBACK_URL` | empty | Callback push URL on task completion (optional) |
| `MEDIACRAWLER_STORAGE_STATE_PATHS` | empty | Browser-engine login-state file paths (comma-separated candidates) |
| `MEDIACRAWLER_STORAGE_STATE_PATH` | empty | Same as above (single-path compatibility) |
| `MEDIACRAWLER_MOCK` | `0` | Browser-engine mock mode (no Playwright dependency; for self-tests/validation) |
| `MEDIACRAWLER_RETRY_MAX` | `3` | Browser-engine page-visit retries (including the first attempt) |
| `MEDIACRAWLER_RETRY_BASE_S` | `0.8` | Browser-engine exponential-backoff base seconds |
| `MEDIACRAWLER_RETRY_CAP_S` | `12` | Browser-engine exponential-backoff cap seconds |
| `MEDIACRAWLER_DELAY_MIN_S` | `0.4` | Browser-engine random-delay lower bound (s) |
| `MEDIACRAWLER_DELAY_MAX_S` | `1.2` | Browser-engine random-delay upper bound (s) |
| `COOKIES` / `XHS_COOKIES` | empty | API-engine login cookie (sensitive: inject server-side only; never bake into images/repos) |
| `LOG_LEVEL` | `INFO` | Log level |
| `LOG_ROTATION` | `1 day` | Log rotation policy (loguru rotation) |
| `LOG_RETENTION` | `14 days` | Log retention policy (loguru retention) |
| `LOG_COMPRESSION` | `zip` | Compression policy (loguru compression) |

`MEDIACRAWLER_STORAGE_STATE_PATHS` example:

```bash
export MEDIACRAWLER_STORAGE_STATE_PATHS="/data/xhs/storage_state.json,/app/datas/state.json"
```

### 🐳 Docker Compose Deployment (recommended)
A `docker-compose.yml` at the project root mounts the host's `./storage` into the container's `/app/storage`:

```bash
docker compose up -d --build
```

Recommended practices:
- Inject sensitive variables such as `COOKIES`/`CALLBACK_URL` via a separate env file (e.g., `.env.local`) kept out of the repo
- In production, terminate TLS behind a reverse proxy (Nginx/Traefik) and set `x-forwarded-for` so per-IP rate limiting sees the real client IP

### ☕ Java Integration Examples
#### Mode A: Pull (recommended; simplest and most robust)
Flow:
1) Java calls `POST /api/v1/tasks` to create a task and gets a task_id
2) Java polls `GET /api/v1/tasks/{task_id}`, or polls `GET /api/v1/tasks/{task_id}/result` directly
3) Persist `normalized/meta`; archive `raw` as needed

OkHttp example (condensed):

```java
import okhttp3.*;
import java.util.concurrent.TimeUnit;

public class SpiderXhsClient {
    private final OkHttpClient http = new OkHttpClient.Builder()
        .connectTimeout(5, TimeUnit.SECONDS)
        .readTimeout(30, TimeUnit.SECONDS)
        .build();

    public String createTask(String baseUrl, String noteUrl) throws Exception {
        String json = "{"
            + "\"task_type\":\"note_url\","
            + "\"engine\":\"auto\","
            + "\"payload\":{\"note_url\":\"" + noteUrl + "\",\"operator\":\"java\"}"
            + "}";
        Request req = new Request.Builder()
            .url(baseUrl + "/api/v1/tasks")
            .post(RequestBody.create(json, MediaType.parse("application/json")))
            .build();
        try (Response resp = http.newCall(req).execute()) {
            if (!resp.isSuccessful()) throw new RuntimeException("createTask http=" + resp.code());
            return resp.body().string();
        }
    }
}
```

In production, parse the JSON response and read `task.id` as the task_id.

Tips:
- A `409` from `GET /tasks/{task_id}/result` means the result is not ready; retry with backoff (e.g., 0.5s/1s/2s increasing).
- If `meta.ok=false` with `meta.error_kind=risk/auth`, a human usually needs to solve the CAPTCHA or refresh the cookie/login state.

#### Mode B: Callback push (optional)
Configuration:
- Set `CALLBACK_URL` (or `SERVICE_CALLBACK_URL`) on the Spider_XHS side

Callback behavior:
- On task completion the service issues `POST CALLBACK_URL`
- Request header: `Idempotency-Key: <sha256>`
- Request body: `{"task_id":"...","result":{...}}`

Spring Boot receiver example (illustrative; implement idempotent dedup storage yourself):

```java
import org.springframework.web.bind.annotation.*;
import java.util.Map;

@RestController
public class SpiderXhsCallbackController {
    @PostMapping("/spider-xhs/callback")
    public Map<String, Object> callback(
        @RequestHeader(value = "Idempotency-Key", required = false) String idemKey,
        @RequestBody Map<String, Object> body
    ) {
        return Map.of("ok", true);
    }
}
```

In production, deduplicate on `Idempotency-Key` (e.g., a DB record or Redis SETNX) and keep all sensitive fields out of logs.

Callback failure handling:
- The service retries automatically (up to 5 times) and persists the failure to disk if it still fails
- The upstream can call `POST /api/v1/tasks/{task_id}/callback/retry` to push again

### 🛡️ Risk Control / CAPTCHA / Ban Advice (must read)
- Throttle: cap QPS with random jitter (especially for search and bulk note details); avoid fixed rates and dense peak-hour bursts.
- Concurrency: shrink client threads and server concurrency together; start small (e.g., 1-3) and ramp up under load testing.
- Proxies: prefer stable residential/dedicated proxies; keep them sticky per account/session, avoid frequent exit switching, and rotate only on errors.
- Manual CAPTCHAs: on CAPTCHAs/risk control (`meta.error_kind=risk/auth`), have a human complete verification in a browser and refresh the cookie or Playwright storage_state.
- Account isolation: use distinct accounts and storage directories per business/environment; shared cookies kick each other offline.

### 🧰 Operations Notes (logs/cleanup/migration)
- Log directory: `<STORAGE_ROOT>/logs/`, default file `service.log` (rotated per `LOG_ROTATION`, retained per `LOG_RETENTION`, compressed as configured)
- Raw snapshots: the browser engine may write HTML snapshots to `<STORAGE_ROOT>/raw/`, retained for `RAW_DATA_RETENTION_DAYS` (cleaned at startup + periodically)
- Storage migration: if a legacy `tasks.json` is detected at startup, the service automatically migrates it to `<STORAGE_ROOT>/tasks/{task_id}.json` and renames the old file to `tasks.json.migrated`
- Backups: back up the whole `<STORAGE_ROOT>` directory (tasks, results, failed callbacks, snapshots, and logs)

### ✅ Compliance Red Lines (must obey)
- Use this project only with authorization and in compliance with platform terms and local laws and regulations
- Never scrape/store/distribute sensitive personal information; avoid long-term retention of identifiable data (PII)
- Cookies, storage_state, callback signatures, and the like are sensitive credentials; never commit them to repos, images, public logs, or error reports
- If data must be retained: minimize the fields collected, set retention periods, and build audit and deletion mechanisms

---

## 📁 Project Layout

```
Spider_XHS/
├── main.py                        # Main entry: crawler usage examples
├── apis/
│   ├── xhs_pc_apis.py             # Full Xiaohongshu PC APIs (scraping)
│   ├── xhs_creator_apis.py        # Creator-platform APIs (upload/publish)
│   ├── xhs_pc_login_apis.py       # PC login (QR code / SMS code)
│   ├── xhs_creator_login_apis.py  # Creator-platform login
│   ├── xhs_pugongying_apis.py     # Pugongying APIs (KOL data)
│   └── xhs_qianfan_apis.py        # Qianfan APIs (distributor data)
├── xhs_utils/
│   ├── common_util.py             # Init helpers (reads .env config)
│   ├── cookie_util.py             # Cookie parsing
│   ├── data_util.py               # Data handling (Excel saving, media download)
│   ├── xhs_util.py                # PC-side signature wrappers
│   ├── xhs_creator_util.py        # Creator-platform signature wrappers
│   ├── xhs_pugongying_util.py     # Pugongying helpers
│   └── xhs_qianfan_util.py        # Qianfan helpers
├── static/
│   ├── xhs_main_260411.js         # Core PC signature JS (latest)
│   ├── xhs_creator_260411.js      # Core creator-platform signature JS (latest)
│   └── ...
├── .env                           # Cookie config (do not commit to git)
├── requirements.txt
├── Dockerfile
└── package.json
```

---

## 🗝️ Notes

- `main.py` is the crawler entry point; adapt the calls to your needs
- `apis/xhs_pc_apis.py` contains all PC data endpoints
- `apis/xhs_creator_apis.py` contains the creator-platform publishing endpoints
- Cookies expire; refresh them when they do
- Use proxies (the `proxies` parameter) to reduce ban risk

---

## 🍥 Changelog

| Date | Notes |
|------|------|
| 23/08/09 | Initial commit |
| 23/09/13 | Added two params fields to the API; fixed image downloads and errors on some inaccessible pages |
| 23/09/16 | Fixed encoding of large videos; added exception handling |
| 23/09/18 | Code refactor; added retry on failure |
| 23/09/19 | Added downloading of search results |
| 23/10/05 | Skip already-downloaded items; fetch richer note and user info |
| 23/10/08 | Published to PyPI; installable via pip install |
| 23/10/17 | Search downloads gained sort options (relevance / hottest / latest) |
| 23/10/21 | Added a GUI; shipped as release v2.1.0 |
| 23/10/28 | Bug fix: hidden search feature |
| 25/03/18 | API updates and assorted fixes |
| 25/06/07 | Updated the search API; split video vs. gallery downloads; added creator-platform APIs |
| 25/07/15 | Updated to xs version56 & creator-platform endpoints |
| 26/04/11 | Rebuilt creator-platform APIs (gallery/video upload); added Pugongying KOL data APIs and Qianfan distributor APIs; upgraded signatures to the latest version |
| 26/04/16 | **Architecture upgrade**: moved to the "stability brain + dual automatic engines + RPA backfill" architecture (Spider_XHS + MediaCrawler fallback + extension backfill); added the FastAPI microservice, webhook callbacks, Data Assistant Excel parsing, Prometheus monitoring, and per-IP rate limiting |
| 26/04/16 | **Business loop (sample-grade)**: added the `orchestrator` layer with 14 core SQLite business tables, providing a minimal "crawl sync -> basic cleaning -> mock Feishu sync/alert persistence" loop |
| 26/04/16 | **Stability hardening**: introduced the **Stability Controller** brain and resource pools (Account/Session/Proxy Pool); upgraded to **dual automatic engines + RPA backfill**; enhanced the browser engine's human-like Stealth; task state machine upgraded to multi-stage failover transitions |
| 26/04/16 | **Visual console**: added the React + Vite front-end dashboard with the resource-pool center (account/session/proxy status), error center (failed-task aggregation), and content library pages |

---

## 🧸 Final Notes

1. PRs and issues are welcome.
2. The project is actively maintained toward ever more stable data scraping.
USAGE.md
ADDED
@@ -0,0 +1,299 @@
# Xiaohongshu Stable Scraping Microservice (Spider_XHS) - Usage Guide

This document guides developers, operators, and business users through quickly deploying, configuring, integrating with, and operating the Spider_XHS data-scraping microservice.

---

## 1. Quick Start and Deployment

Docker Compose deployment is recommended for a consistent runtime environment.

### 1.1 Prerequisites
- **Environment**: Docker and Docker Compose installed.
- **Configuration**: prepare a `.env` file on the host (e.g., at the project root).
- **Data directory**: reserve enough host disk space for the mounted `./storage` directory (task state, results, failed-callback records, logs, and HTML snapshots).

### 1.2 Startup
From the project root, build and start in detached mode:
```bash
docker compose up -d --build
```
Once up, the FastAPI service listens on the host's port `8000` by default. Visit `http://localhost:8000/docs` for the Swagger UI.
| 22 |
+
|
| 23 |
+
### 1.3 运营控制台前端(frontend/,可选)
|
| 24 |
+
项目内置一个运营控制台前端(Vite + React),用于在浏览器中查看健康状态、任务列表、任务详情与监控指标。
|
| 25 |
+
|
| 26 |
+
在项目根目录执行:
|
| 27 |
+
```bash
|
| 28 |
+
cd frontend
|
| 29 |
+
npm install
|
| 30 |
+
export VITE_API_BASE_URL=/api/v1
|
| 31 |
+
npm run dev
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
访问地址:`http://localhost:5173/`
|
| 35 |
+
|
| 36 |
+
页面路由:
|
| 37 |
+
- `/dashboard`:服务健康概览
|
| 38 |
+
- `/tasks`、`/tasks/:id`:任务中心与任务详情
|
| 39 |
+
- `/rpa`:RPA 回传(Chrome 插件)
|
| 40 |
+
- `/errors`:错误中心
|
| 41 |
+
- `/resources/accounts`、`/resources/sessions`、`/resources/proxies`:资源池中心
|
| 42 |
+
- `/content/raw-notes`、`/content/cleaned-notes`:内容库
|
| 43 |
+
- `/metrics`:监控指标
|
| 44 |
+
|
| 45 |
+
本地联调建议同时启动 FastAPI:
|
| 46 |
+
```bash
|
| 47 |
+
python -m uvicorn Spider_XHS.service.app:app --host 0.0.0.0 --port 8000 --reload
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## 2. Core Configuration (Environment Variables)

Most service behavior is controlled through environment variables. Inject sensitive values (such as cookies) via the environment; never hard-code them.

| Variable | Default | Purpose |
|---|---|---|
| `ENGINE_STRATEGY` | `auto` | Engine scheduling strategy (only engines A/B run automatically on the server). Options:<br>- `auto`: prefer the API engine, automatically degrade to the browser engine on risk control<br>- `api`: protocol-level fast engine only (Spider_XHS)<br>- `browser`: automated browser engine only (MediaCrawler) |
| `COOKIES` / `XHS_COOKIES` | none | Cookie string from a logged-in Xiaohongshu session. Multiple accounts are supported, e.g. via `COOKIES_LIST` separated by commas or newlines. |
| `ACCOUNT_COOLDOWN_SECONDS` | `900` | Cooldown duration (seconds) applied to an account that hits severe risk control (rate/risk). |
| `SERVICE_PROXY` | none | A single proxy server address (e.g. `http://127.0.0.1:7890`). |
| `SERVICE_PROXY_POOL` | none | Comma-separated proxy list. The browser engine rotates through it across retries; it is merged and de-duplicated with `SERVICE_PROXY`. |
| `PROXY_API_URL` | none | Dynamic proxy-pool extraction API (single address, kept for backward compatibility); prefer `PROXY_API_URLS`. |
| `PROXY_API_URLS` | none | Dynamic proxy-pool extraction APIs; multiple providers can be configured, comma-separated. The system periodically pulls, health-checks, scores, and ejects proxies automatically. |
| `PROXY_FILE_PATH` | none | Path to a static proxy-pool file (one proxy per line); can be aggregated with the API proxy pools. |
| `CALLBACK_URL` | none | Webhook URL to push to when a task completes. If set, the service pushes data on both success and failure. |
| `STORAGE_ROOT` | `./storage` | Local file-storage path inside the container. |
| `ORCHESTRATOR_DB_PATH` | `orchestrator/data/mvp.db` | Orchestrator SQLite path (the `/content/*` content-library endpoints open it read-only). |
| `RAW_DATA_RETENTION_DAYS` | `7` | Days to retain raw HTML snapshots captured by the browser engine; expired snapshots are cleaned up automatically. |
| `ERROR_SUMMARY_SCAN_LIMIT` | `1000` | Default number of recent tasks scanned by the error center (the default `scan_limit` of `/errors/summary`). |
| `MEDIACRAWLER_STEALTH` | `1` | Whether the browser engine injects Stealth scripts (`0` to disable). |
| `MEDIACRAWLER_HUMANIZE` | `1` | Whether the browser engine performs randomized human-like actions (`0` to disable). |
| `AGENT_LLM_API_KEY` | none | **[Required]** API key of the vision LLM that drives the browser-operating AI Agent (e.g. an OpenAI key). |
| `AGENT_LLM_MODEL` | `gpt-4o` | Model used by the AI Agent. Must support multimodal vision (e.g. `gpt-4o`, `claude-3-5-sonnet-20241022`). |
| `AGENT_LLM_BASE_URL` | none | Custom base URL for the LLM API (set when routing through a relay endpoint). |
| `OPENAI_API_KEY` | none | Standalone setting: configure this if AI image/text generation (`ai_generation.py`) needs a different model or channel. |

---

## 3. Hands-On Data Collection Guide

The system offers several collection methods to suit local testing, operations work, and system integration.

### 3.1 Method 1: Command-Line Tool (CLI), recommended for testing and obtaining cookies
Best for developers who want to quickly test collection locally and obtain the freshest, most complete anti-risk-control cookies.
1. **Log in via QR code and save cookies**:
```bash
python cli.py login pc-qrcode --save-cookies --write-env
```
> This opens a real browser window for you to scan and log in. On success, the full cookies and browser state (including `storage_state.json`) are written back to the `.env` file. This is the best fix for `auth` risk-control blocks.
2. **Run a collection command**:
```bash
python cli.py search --query "小红书架构设计" --num 10
```
> Supported subcommands include `search` and others; results are saved to `datas/excel_datas/` under the project directory by default.

### 3.2 Method 2: Web Operations Console (Web UI)
Best for operators or anyone who prefers a visual, no-code workflow.
1. **Start the services**: make sure the FastAPI backend and the Vite frontend are running (see sections 1.2 and 1.3).
2. **Open the console**: navigate to `http://localhost:5173/tasks` for the task center.
3. **Create a task**:
   - **task_type**: `search` (keyword search), `note_url` (single note), or `user_profile` (user homepage).
   - **target**: the matching keyword, note link, or user homepage link.
   - Click **Create Task**.
4. **View results**: the new task appears in the list; click its ID to open the detail page and watch the polling status, Raw (original snapshot data), and Normalized (cleaned standard data) in real time.

### 3.3 Method 3: API Calls (upstream integration)
Best for embedding the collection capability into Java/Go/Python backend microservices.
- **Workflow**: send an HTTP POST to start a task, then either poll with GET or configure a webhook to receive an automatic callback.
- **Details**: see section **4. Integration Guide (Upstream Callers)** below.

### 3.4 Method 4: Offline Fallback and Extension RPA Upload
The manual fallback for when every online automated engine is blocked by extremely strict risk control.
- **Workflow**: export an Excel file from the Xiaohongshu Data Assistant, or collect pages in a real browser via the Chrome extension, then call the matching import endpoint to push the data back to the service.
- **Details**: see **4.4 Scenario 3: Offline Data Fallback Import** and **4.5 Browser Extension RPA Upload**.

---

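
The multi-value variables above (`COOKIES_LIST`, `SERVICE_PROXY_POOL`, `PROXY_API_URLS`) accept comma- or newline-separated lists that get merged and de-duplicated. As a rough illustration of that contract only (this is not the service's actual loader), parsing might look like:

```python
import os

def parse_env_list(*names: str) -> list[str]:
    """Collect values from several env vars, splitting on commas and newlines."""
    items: list[str] = []
    for name in names:
        for part in os.environ.get(name, "").replace("\n", ",").split(","):
            part = part.strip()
            if part and part not in items:  # merge sources, drop duplicates
                items.append(part)
    return items

cookie_pool = parse_env_list("COOKIES", "XHS_COOKIES", "COOKIES_LIST")
proxy_pool = parse_env_list("SERVICE_PROXY", "SERVICE_PROXY_POOL")
```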
## 4. Integration Guide (Upstream Callers)

### 4.1 API Conventions
All API paths are prefixed with `/api/v1`.
Every response is JSON in a uniform envelope:
```json
{
  "code": 200,    // 200 means success; 100xx are specific business errors
  "msg": "success",
  "data": { ... } // the actual payload
}
```

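Because every endpoint uses this envelope, a caller can centralize the `code` check in one helper. A minimal Python sketch (the `ApiError` class and the `call_api` name are our own illustration, not part of the service):

```python
import requests

class ApiError(Exception):
    """Raised when the service reports a non-200 business code."""

def call_api(method: str, path: str, base: str = "http://localhost:8000/api/v1", **kwargs) -> dict:
    resp = requests.request(method, base + path, timeout=30, **kwargs)
    resp.raise_for_status()
    body = resp.json()
    if body.get("code") != 200:
        raise ApiError(f"{body.get('code')}: {body.get('msg')}")
    return body.get("data", {})
```
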
### 4.2 Scenario 1: Asynchronous Polling (recommended)
The simplest and most robust integration. The upstream system (e.g. a Java service) starts a task, then queries for the result periodically.

**Step 1: create a task**
```bash
curl -X POST "http://localhost:8000/api/v1/tasks" \
  -H "Content-Type: application/json" \
  -d '{
    "task_type": "note_url",
    "target": "https://www.xiaohongshu.com/explore/xxxx",
    "engine": "auto"
  }'
```
> `data.task.id` in the response is the `task_id`.

**Step 2: poll for the result**
```bash
curl "http://localhost:8000/api/v1/tasks/<task_id>/result"
```
> **Note**:
> - While the task is still running, the endpoint returns HTTP `409` (result not ready); the caller should catch the 409 and retry after 1-2 seconds.
> - Once the task finishes it returns 200, where `data.normalized` holds the standardized fields and `data.meta` carries the engine used and provenance information.

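The 409-and-retry contract translates directly into a small polling client. A sketch under the documented contract (the interval and timeout values are illustrative):

```python
import time

import requests

BASE = "http://localhost:8000/api/v1"

def create_task(task_type: str, target: str, engine: str = "auto") -> str:
    resp = requests.post(f"{BASE}/tasks", json={
        "task_type": task_type, "target": target, "engine": engine,
    }, timeout=30)
    resp.raise_for_status()
    return resp.json()["data"]["task"]["id"]  # documented location of task_id

def wait_for_result(task_id: str, interval: float = 2.0, timeout: float = 300.0) -> dict:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = requests.get(f"{BASE}/tasks/{task_id}/result", timeout=30)
        if resp.status_code == 409:       # result not ready yet, retry shortly
            time.sleep(interval)
            continue
        resp.raise_for_status()
        return resp.json()["data"]        # contains raw / normalized / meta
    raise TimeoutError(f"task {task_id} did not finish within {timeout}s")
```
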
### 4.3 Scenario 2: Webhook Callback Push
If the upstream system prefers not to poll, set `CALLBACK_URL` in the environment.

When a task completes (success or failure), the service POSTs to that address:
- **Header**: includes `Idempotency-Key` (a hash derived from the task_id and content); the upstream should use it for **idempotent de-duplication**.
- **Body**:
```json
{
  "task_id": "...",
  "result": {
    "raw": {...},
    "normalized": {...},
    "meta": {"ok": true, ...}
  }
}
```
> If a callback fails (network jitter or an upstream 5xx), the service retries up to 5 times with exponential backoff.

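On the receiving side, the `Idempotency-Key` header is what protects you against the retry mechanism delivering the same result twice. A minimal FastAPI receiver sketch (the in-memory `seen_keys` set stands in for a durable store):

```python
from fastapi import FastAPI, Header, Request

app = FastAPI()
seen_keys: set[str] = set()  # use a durable store (Redis, DB) in real deployments

@app.post("/spider-callback")
async def spider_callback(request: Request, idempotency_key: str | None = Header(None)):
    # The service sends Idempotency-Key; seeing the same key again means a retry.
    if idempotency_key and idempotency_key in seen_keys:
        return {"status": "duplicate"}
    payload = await request.json()  # {"task_id": ..., "result": {"raw": ..., "normalized": ..., "meta": ...}}
    print(payload["task_id"], payload["result"]["meta"].get("ok"))
    if idempotency_key:
        seen_keys.add(idempotency_key)
    return {"status": "ok"}         # any 2xx stops the exponential-backoff retries
```
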
### 4.4 Scenario 3: Offline Data Fallback Import
When online risk control is so strict that both the API and browser engines fail, the business side can export an Excel file from the "Xiaohongshu Data Assistant" and have an upstream system or an operator call the import endpoint, injecting the offline data smoothly into the existing flow.

```bash
curl -X POST "http://localhost:8000/api/v1/import/excel" \
  -F "file=@/path/to/data_assistant.xlsx" \
  -F "operator=运营人员张三"
```
> The service auto-detects the Excel headers (impressions, reads, engagement, etc.) and returns a standardized `normalized` data array.

### 4.5 Browser Extension RPA Upload: POST /api/v1/import/extension
Purpose: this path is a **manual fallback channel** (outside the server-side automatic engines). It is typically used for tasks in the `waiting_rpa` state: an operator completes login/captcha in a real browser, collects the page via the Chrome extension, and uploads it. The server currently only verifies that the `task_id` exists, then writes the result and moves the task to `rpa_imported`.

```bash
curl -X POST "http://localhost:8000/api/v1/import/extension" \
  -H "Content-Type: application/json" \
  -d '{
    "task_id": "...",
    "raw": {...},
    "normalized": {...}
  }'
```

### 4.6 Read-Only Console Endpoints
Resource-pool center (read-only):
- `GET /api/v1/resources/accounts`
- `GET /api/v1/resources/sessions`
- `GET /api/v1/resources/proxies`

Error center (aggregated stats):
- `GET /api/v1/errors/summary`
  - `scan_limit`: number of recent tasks to scan (defaults to `ERROR_SUMMARY_SCAN_LIMIT`, or 1000 if unset)
  - `status` / `error_kind`: comma-separated filters

Content library (Orchestrator SQLite, read-only):
- `GET /api/v1/content/raw-notes` (`query` fuzzy-matches author/url/content)
- `GET /api/v1/content/cleaned-notes` (`query` fuzzy-matches cleaned_content/raw_note fields)

---

## 5. Supported Task Types (task_type)

`POST /api/v1/tasks` supports three main collection tasks:

| task_type | Meaning of target | Example | Output (`normalized` core fields) |
|---|---|---|---|
| `note_url` | Note link | `https://www.xiaohongshu.com/explore/xxx` | `note_id`, `title`, `author`, `publish_time` |
| `user_profile` | User homepage link or ID | `https://www.xiaohongshu.com/user/profile/xxx` | `user_id`, `nickname`, `title` (bio) |
| `search` | Search keyword | `AI Agent` | A list of basic note info matching the keyword |

---

## 6. Operations and Monitoring

### 6.1 Per-IP Rate Limiting (anti-avalanche)
The service has built-in anti-avalanche rate limiting (default: `100 requests / 60 seconds` per IP). Requests over the threshold are rejected immediately with HTTP `429 Too Many Requests` and a `Retry-After` header.
> **Recommendation**: on a 429, the upstream should sleep for the number of seconds given in `Retry-After` instead of hammering the service.

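A compliant client wraps its calls accordingly; a minimal sketch:

```python
import time

import requests

def get_with_backoff(url: str, max_attempts: int = 5) -> requests.Response:
    for _ in range(max_attempts):
        resp = requests.get(url, timeout=30)
        if resp.status_code != 429:
            return resp
        # Honor the server's Retry-After (seconds); fall back to 5s if absent.
        time.sleep(int(resp.headers.get("Retry-After", "5")))
    raise RuntimeError(f"still rate-limited after {max_attempts} attempts: {url}")
```
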
### 6.2 Prometheus Metrics
The service exposes a standard Prometheus metrics endpoint:
```bash
curl http://localhost:8000/api/v1/metrics
```
**Core metrics**:
- `spider_xhs_tasks_total{engine="api", status="succeeded"}`: task totals broken down by engine and status.
- `spider_xhs_queue_length`: number of tasks currently queued.
- `spider_xhs_recent_failure_rate`: real-time failure rate over the last 5 minutes (useful for Grafana alerts, e.g. trigger a risk-control warning above 20%).
- Proxy-pool metrics (if the Proxy Pool is enabled): `spider_xhs_proxy_pool_size`, `spider_xhs_proxy_pool_avg_score`, `spider_xhs_proxy_pool_ejected_total`, `spider_xhs_proxy_pool_failures_total{reason}`.

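When no Grafana/Alertmanager stack is wired up yet, the failure-rate gauge can still be checked from a plain script. A rough sketch (the text parsing is deliberately simplistic; the 20% threshold mirrors the suggestion above):

```python
import requests

def recent_failure_rate(metrics_url: str = "http://localhost:8000/api/v1/metrics") -> float:
    for line in requests.get(metrics_url, timeout=10).text.splitlines():
        if line.startswith("spider_xhs_recent_failure_rate"):
            return float(line.rsplit(" ", 1)[-1])  # the value is the last token
    return 0.0

if recent_failure_rate() > 0.2:
    print("WARNING: failure rate above 20% in the last 5 minutes, possible risk-control wave")
```
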
### 6.3 Storage and Data Cleanup
All task state, collection results, failed-callback records, and logs are persisted under the host-mounted `./storage` directory.
- **Atomic writes + process lock**: writes go through a temp file followed by an atomic `os.replace`; critical update paths take an `fcntl.flock` for cross-process mutual exclusion, so concurrent updates cannot clobber each other.
- **Automatic cleanup**: a background thread scans `./storage/raw` daily and removes HTML snapshots older than `RAW_DATA_RETENTION_DAYS` (default 7), keeping the disk from filling up.

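The write pattern described above (exclusive flock, temp file, then `os.replace`) is a standard POSIX recipe. A minimal sketch of the idea, not the service's actual storage module:

```python
import fcntl
import json
import os
import tempfile

def atomic_write_json(path: str, payload: dict) -> None:
    lock_path = path + ".lock"
    with open(lock_path, "w") as lock:
        fcntl.flock(lock, fcntl.LOCK_EX)  # cross-process mutual exclusion
        fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
        with os.fdopen(fd, "w") as f:
            json.dump(payload, f, ensure_ascii=False)
            f.flush()
            os.fsync(f.fileno())          # ensure bytes are on disk before the swap
        os.replace(tmp, path)             # atomic: readers never see a torn file
```
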
### 6.4 Troubleshooting Common Errors

If a result comes back with `data.meta.ok = false`, inspect `data.meta.error_kind`:

- **`auth`**: authentication failed, most likely an expired cookie. **System strategy**: mark it invalid and rotate within the Session Pool; error out once the pool is exhausted.
- **`rate`**: rate-limited. **System strategy**: put the current account into a cooldown window (e.g. 15 minutes), switch accounts, slow down, and retry.
- **`risk` / `captcha`**: slider captcha triggered. **System strategy**: the primary engine errors out and degrades to the browser engine; if the captcha still blocks it, the **Agentic Captcha Solver** (vision-LLM based) wakes up to drag the slider / solve the click captcha automatically; if the AI also fails, the task is set to `waiting_rpa` for a human to fetch via the Chrome extension.
- **`parse`**: page parsing failed, usually after a major Xiaohongshu frontend DOM overhaul. **System strategy**: automatically degrade to **Engine D (Agentic Crawler)**, which uses the LLM's vision and reasoning to locate and extract page data regardless of DOM changes.
- **`timeout` / `proxy_failed`**: network timeout. **System strategy**: the proxy pool immediately down-scores or ejects the IP, switches to a fresh proxy, and retries.

---

## 7. AI Automation Orchestration Module (Orchestrator MVP)

On top of the underlying collection service, the project ships a lightweight Python-script orchestration sample (under `orchestrator/`) demonstrating **end-to-end lead-generation automation** built with LLMs and agentic techniques.

### 7.1 Initialize the Database and Seed Data
The orchestration module uses SQLite (`orchestrator/data/mvp.db`) as its main database; initialize it before first use:
```bash
# Create the 14 core business tables
python orchestrator/db_init.py

# Seed test keywords and competitor accounts
python orchestrator/seed_data.py
```

### 7.2 Core Business-Flow Scripts
Make sure the FastAPI collection service is running and `AGENT_LLM_API_KEY` is set in the environment. Run the scripts in order to walk through the loop (a sketch of the first step follows this list):

1. **Crawl sync**: reads the keywords, calls the crawler microservice, and persists the JSON data.
```bash
python orchestrator/crawl_sync.py
```
2. **Data cleaning**: extracts structured fields (title, engagement counts, etc.) from the raw snapshots.
```bash
python orchestrator/note_cleaner.py
```
3. **AI content generation**: uses an LLM to analyze hot notes, write imitations, and generate original images.
```bash
python orchestrator/ai_generation.py
```
4. **AI auto-publish**: wakes the `browser-use` AI browser agent to operate the Xiaohongshu creator center and post for real.
```bash
python orchestrator/publish_tracker.py
```
5. **AI DM outreach**: picks high-intent engaged users and has the AI Agent open DM windows and reply with lead-capture scripts.
```bash
python orchestrator/lead_service.py
```
6. **Feishu sync and alerting (mock)**: pushes captured leads to an upstream CRM (e.g. a Feishu Bitable).
```bash
python orchestrator/feishu_sync.py
```
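
For orientation, the first step amounts to "create a search task, wait for the result, store the JSON". An illustrative, self-contained sketch of that shape (the table and column names are made up, not the real 14-table schema):

```python
import json
import sqlite3
import time

import requests

BASE = "http://localhost:8000/api/v1"

def sync_keyword(keyword: str, db_path: str = "orchestrator/data/mvp.db") -> None:
    # Start a search task and poll until it finishes (409 = not ready yet).
    task = requests.post(f"{BASE}/tasks", json={"task_type": "search", "target": keyword}, timeout=30)
    task_id = task.json()["data"]["task"]["id"]
    while True:
        resp = requests.get(f"{BASE}/tasks/{task_id}/result", timeout=30)
        if resp.status_code != 409:
            break
        time.sleep(2)
    data = resp.json()["data"]
    # Persist the raw JSON payload.
    con = sqlite3.connect(db_path)
    con.execute("CREATE TABLE IF NOT EXISTS demo_raw_notes (keyword TEXT, payload TEXT)")
    con.execute("INSERT INTO demo_raw_notes VALUES (?, ?)", (keyword, json.dumps(data, ensure_ascii=False)))
    con.commit()
    con.close()
```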
apis/__init__.py
ADDED
File without changes
apis/xhs_creator_apis.py
ADDED
@@ -0,0 +1,428 @@
import json
import time

import cv2
import numpy as np
from xhs_utils.cookie_util import trans_cookies
from xhs_utils.xhs_creator_util import get_upload_media_headers, get_post_note_headers, \
    get_loc_data, signature_js, get_fileIds_params, get_query_transcode_headers, \
    get_encryption_headers, sign_js, get_post_note_video_data, get_post_note_image_data, get_common_headers, \
    generate_xs, generate_xsc, get_search_location_headers
from xhs_utils.xhs_util import splice_str, generate_x_b3_traceid
from xhs_utils.http_client import HttpClient


class XHS_Creator_Apis():
    def __init__(self):
        self.base_url = "https://creator.xiaohongshu.com"
        self.upload_url = "https://ros-upload.xiaohongshu.com"
        self.edith_url = "https://edith.xiaohongshu.com"
        self.xhs_web_url = "https://www.xiaohongshu.com"
        self.client = HttpClient()

    def get_topic(self, keyword, cookies):
        """Search creator-platform topics (hashtags) by keyword."""
        try:
            api = "/web_api/sns/v1/search/topic"
            data = {
                "keyword": keyword,
                "suggest_topic_request": {
                    "title": "",
                    "desc": f"#{keyword}"
                },
                "page": {
                    "page_size": 20,
                    "page": 1
                }
            }
            headers = get_common_headers()
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            result = self.client.request_json("POST", self.edith_url + api, headers=headers, cookies=cookies, data=data.encode('utf-8'))
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            res_json = None
            success, msg = False, str(e)
        return success, msg, res_json

    def get_location_info(self, keyword, cookies):
        """Search POI (location) candidates by keyword."""
        try:
            data = get_loc_data(keyword)
            api = "/web_api/sns/v1/local/poi/creator/search"
            headers = get_search_location_headers()
            h = generate_xsc(cookies['a1'], api, data)
            headers.update(h)
            if data:
                data = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
            result = self.client.request_json("POST", self.edith_url + api, headers=headers, cookies=cookies, data=data.encode('utf-8'))
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            res_json = None
            success, msg = False, str(e)
        return success, msg, res_json

    # media_type: image or video
    def get_fileIds(self, media_type, cookies):
        """Ask the creator platform for an upload permit (fileIds + token)."""
        try:
            api = "/api/media/v1/upload/creator/permit"
            # Header set captured verbatim from a browser request; the x-s/x-t
            # values below are placeholders and are overwritten with freshly
            # generated signatures before the request is sent.
            headers = {
                "accept": "application/json, text/plain, */*",
                "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
                "authorization;": "",
                "cache-control": "no-cache",
                "pragma": "no-cache",
                "priority": "u=1, i",
                "referer": "https://creator.xiaohongshu.com/publish/publish?source=official&from=menu&target=image",
                "sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Microsoft Edge\";v=\"138\"",
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": "\"Windows\"",
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "cors",
                "sec-fetch-site": "same-origin",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36 Edg/138.0.0.0",
                "x-b3-traceid": "f4f93b86e05f2402",
                "x-s": "XYS_2UQhPsHCH0c1PjhFHjIj2erjwjQM89PjNsQhPjHCHS4kJfz647PjNsQhPUHCHfM1qAZlPebKPbYxwrk9+LEt4p4OJbmLG98e4M4HLgihGDE9y9krzd8r+DEI4Bz+pepY4n+w//QfafzgwBH94A+G2BQxcAmG/ApG2gYyLrhE2rhl8ePlanWM//8Y+f+OLLH9/rzjpe4aabSayBYBL9kVz/YNPLiFGDkjJLSy2dps4n8GGnHF/fRs+M+bnDEtyA8Y+nq62dY8PFRH40zozFkwNAm+wBFMGjHVHdWFH0ijHdF=",
                "x-t": str(int(time.time() * 1000))
            }
            params = get_fileIds_params(media_type)
            splice_api = splice_str(api, params)

            xs, xt, _ = generate_xs(cookies['a1'], splice_api)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies)
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success, msg = res_json["success"], '获取fileIds成功'
        except Exception as e:
            return False, str(e), (None, None)
        return success, msg, (res_json, xt)

    def upload_media(self, path_or_file, media_type, cookies):
        """Upload raw image/video bytes against a freshly issued permit."""
        res = {
            "fileIds": '',
            "width": '',
            "height": '',
            "video_id": ''
        }
        try:
            success, msg, (data, xt) = self.get_fileIds(media_type, cookies)
            if not success:
                raise Exception(msg)
            data = data['data']['uploadTempPermits'][0]
            fileIds, expireTime, token = data['fileIds'][0].split('/')[-1], data['expireTime'], data['token']
            res['fileIds'] = fileIds
            # The upload signature is computed over second-precision timestamps.
            xt, expireTime = str(xt)[:10], str(expireTime)[:10]
            message = f"{xt};{expireTime}"
            if media_type == "image":
                width, height, file, file_size = self.get_file_info(path_or_file, media_type="image")
                res['width'] = width
                res['height'] = height
            else:
                file, file_size = self.get_file_info(path_or_file, media_type="video")
            signature = signature_js.call('getSignature', message, fileIds, file_size)
            headers = get_upload_media_headers(message, signature, token)
            api = f"/spectrum/{fileIds}"
            result = self.client.request_text("PUT", self.upload_url + api, headers=headers, cookies=cookies, data=file)
            if not result.ok:
                raise Exception(result.msg)
            if media_type == "video" and result.headers:
                res['video_id'] = result.headers.get('X-Ros-Video-Id', '')
        except Exception as e:
            return False, str(e), None
        return True, "上传成功", res

    def query_transcode(self, video_id, cookies):
        """Check whether an uploaded video has been transcoded (first frame ready)."""
        res_json = None
        success, msg = False, ''
        try:
            api = "/fe_api/burdock/v2/note/query_transcode"
            headers = get_query_transcode_headers()
            data = {
                "videoId": video_id
            }
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-b3-traceid'] = generate_x_b3_traceid()
            headers['x-s'], headers['x-t'] = xs, str(xt)
            result = self.client.request_json("POST", self.xhs_web_url + api, headers=headers, cookies=cookies, data=data)
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success = res_json["success"]
            if 'msg' in res_json:
                msg = res_json['msg']
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, res_json

    def encryption(self, file_id, cookies):
        """Exchange a raw file id for its signed (encrypted) form."""
        res_json = None
        success, msg = False, ''
        try:
            api = "/web_api/sns/v5/creator/file/encryption"
            headers = get_encryption_headers()
            params = {
                "file_id": file_id,
                "type": "image",
                "ts": str(int(time.time() * 1000)),
                "sign": ""
            }
            sign = sign_js.call('urlSing', file_id)
            params['sign'] = sign
            splice_api = splice_str(api, params)
            xs, xt, _ = generate_xs(cookies['a1'], splice_api)
            headers['x-b3-traceid'] = generate_x_b3_traceid()
            headers['x-s'], headers['x-t'] = xs, str(xt)
            result = self.client.request_json("GET", self.xhs_web_url + splice_api, headers=headers, cookies=cookies)
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, res_json

    def post_note(self, noteInfo, cookies_str):
        """Publish a note (image set or video); exceptions propagate to the caller."""
        post_api = "/web_api/sns/v2/note"
        headers = get_post_note_headers()
        cookies = trans_cookies(cookies_str)
        title = noteInfo['title']
        desc = noteInfo['desc']
        postTime = noteInfo['postTime']
        location = noteInfo['location']
        type = noteInfo['type']
        media_type = noteInfo['media_type']

        if location is not None:
            success, msg, location_info = self.get_location_info(location, cookies)
            if not success:
                raise Exception(msg)
            if len(location_info['data']['poi_list']) == 0:
                raise Exception('未找到该地点')
            location = location_info['data']['poi_list'][0]
            post_loc = {
                "name": location['name'],
                "subname": location['full_address'],
                "poi_id": location['poi_id'],
                "poi_type": location['poi_type'],
            }
        else:
            post_loc = {}
        if media_type == 'video':
            video = noteInfo['video']
            success, msg, fileInfo = self.upload_media(video, media_type, cookies)
            if not success:
                raise Exception(msg)
            firstFrameFileId = ''
            # Poll until the first frame has been extracted; note there is no
            # upper bound here, so a stuck transcode job will block indefinitely.
            while True:
                success, msg, res = self.query_transcode(fileInfo['video_id'], cookies)
                if not success:
                    raise Exception(msg)
                if res['data']['hasFirstFrame'] == True:
                    firstFrameFileId = res['data']['firstFrameFileId']
                    break
                time.sleep(3)
            success, msg, res = self.encryption('/' + firstFrameFileId, cookies)
            if not success:
                raise Exception(msg)
            data = get_post_note_video_data(title, desc, postTime, post_loc, type, fileInfo, firstFrameFileId)
        else:
            fileInfos = []
            images = noteInfo['images']
            for image in images:
                success, msg, fileInfo = self.upload_media(image, media_type, cookies)
                if not success:
                    raise Exception(msg)
                fileInfos.append(fileInfo)
            data = get_post_note_image_data(title, desc, postTime, post_loc, type, fileInfos)
        topics = noteInfo['topics']
        for topic in topics:
            success, msg, res_json = self.get_topic(topic, cookies)
            if not success:
                raise Exception(msg)
            if len(res_json['data']['topic_info_dtos']) == 0:
                raise Exception(f'未找到话题{topic}')
            insert_topic = res_json['data']['topic_info_dtos'][0]
            insert_topic = {
                "id": insert_topic['id'],
                "link": insert_topic['link'],
                "name": insert_topic['name'],
                "type": 'topic'
            }
            data['common']['hash_tag'].append(insert_topic)
            data['common']['desc'] += f" #{insert_topic['name']}[话题]# "

        # Sign the payload first, then serialize it compactly for the request body.
        xs, xt, _ = generate_xs(cookies['a1'], post_api, data)
        headers['x-s'], headers['x-t'] = xs, str(xt)

        if data:
            data = json.dumps(data, separators=(',', ':'), ensure_ascii=False)

        result = self.client.request_json("POST", self.edith_url + post_api, headers=headers, cookies=cookies, data=data.encode('utf-8'))
        if not result.ok:
            raise Exception(result.msg)
        res_json = result.json
        success, msg = res_json["success"], res_json["msg"]
        return success, msg, res_json

    def get_file_info(self, file, media_type="image"):
        """Return (width, height, bytes, size) for images, or (bytes, size) for video."""
        file_size = len(file)
        if media_type == "image":
            size = cv2.imdecode(np.frombuffer(file, np.uint8), cv2.IMREAD_COLOR).shape
            w, h = size[1], size[0]
            # Cap the reported aspect ratio at 2:1 for very wide images.
            if w > 2 * h:
                h = int(w / 2)
            return w, h, file, file_size
        else:
            return file, file_size


    # # page: page number
    # # time: window of recent days
    # def get_publish_note_info(self, page: int, time: int, cookies_str):
    #     success = False
    #     msg = '成功'
    #     res_json = None
    #     try:
    #         api = "/api/galaxy/creator/data/note_stats/new"
    #         headers = get_common_headers()
    #         cookies = trans_cookies(cookies_str)
    #         xs, xt, _ = generate_xs(cookies['a1'], '/api/galaxy/creator/data/note_stats/new', '')
    #         headers['x-s'], headers['x-t'] = xs, str(xt)
    #         headers['x-b3-traceid'] = generate_x_b3_traceid()
    #         params = {
    #             "page": str(page),
    #             "page_size": "12",
    #             "sort_by": "time",
    #             "note_type": "0",
    #             "time": str(time),
    #             "is_recent": "false"
    #         }
    #         response = requests.get(self.base_url + api, headers=headers, cookies=cookies, params=params)
    #         res_json = response.json()
    #         success = res_json["success"]
    #     except Exception as e:
    #         success, msg = False, str(e)
    #     return success, msg, res_json
    #
    #
    # # fetch all published-note info (old stats endpoint)
    # def get_all_publish_note_info(self, cookies_str):
    #     page = 1
    #     time = 7
    #     success, msg, res_json = self.get_publish_note_info(page, time, cookies_str)
    #     if not success:
    #         return False, msg, None
    #     notes = res_json['data']['note_infos']
    #     total = res_json['data']['total']
    #     while len(notes) < total:
    #         page += 1
    #         success, msg, res_json = self.get_publish_note_info(page, time, cookies_str)
    #         if not success:
    #             return False, msg, None
    #         notes += res_json['data']['note_infos']
    #     return True, '成功', notes

    def get_publish_note_info(self, page, cookies_str):
        """Fetch one page of the creator's posted notes (page=None for the first page)."""
        success = False
        msg = '成功'
        res_json = None
        try:
            api = "/api/galaxy/creator/note/user/posted"
            headers = get_common_headers()
            cookies = trans_cookies(cookies_str)
            xs, xt, _ = generate_xs(cookies['a1'], api, '')
            headers['x-s'], headers['x-t'] = xs, str(xt)
            headers['x-b3-traceid'] = generate_x_b3_traceid()
            params = {
                "tab": '0',
            }
            if page:
                params["page"] = str(page)
            result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, params=params)
            if not result.ok:
                raise Exception(result.msg)
            res_json = result.json
            success = res_json["success"]
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, res_json


    # Fetch all published-note info by following the server-side page cursor.
    def get_all_publish_note_info(self, cookies_str):
        page = None
        notes = []
        while True:
            success, msg, res_json = self.get_publish_note_info(page, cookies_str)
            print(success, msg, res_json)
            if not success:
                return False, msg, notes
            notes += res_json['data']['notes']
            page = res_json['data']['page']
            if page == -1:  # -1 marks the last page
                break
        return True, '成功', notes


if __name__ == '__main__':
    xhs_creator_apis = XHS_Creator_Apis()
    # Cookie from the creator platform https://creator.xiaohongshu.com/login
    cookies_str = r''
    noteInfos = [
        {
            # title
            "title": "21121121212",
            # description
            "desc": "dwadaw最后一把直接神之一手直接立直后第一轮就胡牌了,最近吃点好的,哈哈",
            # 13-digit millisecond timestamp (numeric)
            "postTime": None,
            # location to tag, e.g. "河海大学"
            "location": '南京',
            # 0: public, 1: private
            "type": 1,
            "media_type": "image",
            # topics to attach
            # "topics": ["雀魂", "麻将"],
            "topics": [],
            # image bytes, at most 15 images
            "images": [
                open(r"D:\Desktop\签名\QQ图片20240903150607.jpg", 'rb').read(),
            ],
        },
        {
            "title": "test2",
            "desc": "dwadawd20240815",
            "postTime": None,
            "location": '河海大学',
            "topics": ["北京"],
            # "topics": [],
            "type": 1,
            "media_type": "video",
            "video": open(r"D:\data\Videos\2024-05-02 21-14-45.mkv", 'rb').read(),
        }
    ]
    for noteInfo in noteInfos:
        success, msg, info = xhs_creator_apis.post_note(noteInfo, cookies_str)
        print(success, msg, info)
        print('========')

    # topics = ["雀魂", "麻将"]
    # cookies = trans_cookies(cookies_str)
    # for topic in topics:
    #     success, msg, res_json = xhs_creator_apis.get_topic(topic, cookies)
    #     print(success, msg, res_json)
apis/xhs_creator_login_apis.py
ADDED
@@ -0,0 +1,317 @@
import json
from threading import Thread

import aiohttp
import asyncio

import qrcode

from apis.xhs_creator_apis import XHS_Creator_Apis
from xhs_utils.xhs_creator_util import generate_xs, splice_str, get_common_headers
from playwright.async_api import async_playwright


class XHSLoginApi:
    def __init__(self):
        self.base_url = "https://customer.xiaohongshu.com"
        self.home_url = 'https://creator.xiaohongshu.com'

    # Bootstrap an initial cookie set by visiting the login page in Playwright.
    async def creatorCheckInitCookies(self, page):
        while True:
            cookies = dict()
            page_cookies = await page.context.cookies()
            for cookie in page_cookies:
                cookies[cookie['name']] = cookie['value']
            # Wait until every anti-bot cookie the site sets is present.
            if "a1" in cookies and "xsecappid" in cookies and "webId" in cookies and "acw_tc" in cookies and "gid" in cookies and "websectiga" in cookies and "sec_poison_id" in cookies:
                break
            await asyncio.sleep(1)
        return cookies

    async def creatorGenerateInitCookies(self, headless=True):
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=headless,
                args=[
                    '--disable-blink-features=AutomationControlled',
                ],
            )
            page = await browser.new_page()
            await page.goto(self.home_url + '/login')
            cookies = await self.creatorCheckInitCookies(page)
            await browser.close()
        return cookies

    # Phone verification-code login.
    async def creatorGeneratePhoneCode(self, phone, cookies):
        res_json = None
        try:
            api = "/api/cas/customer/web/verify-code"
            data = {
                "service": "https://creator.xiaohongshu.com",
                "phone": phone,
                "zone": "86"
            }
            headers = get_common_headers()
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.base_url + api, headers=headers, cookies=cookies, data=data) as response:
                    res_json = await response.json()
                    success, msg = res_json['success'], res_json['msg']
        except Exception as e:
            return False, str(e), res_json
        return success, msg, res_json

    async def creatorLoginByPhone(self, phone, code, cookies):
        res_json = None
        try:
            api = "/api/cas/customer/web/service-ticket"
            data = {
                "service": "https://creator.xiaohongshu.com",
                "zone": "86",
                "phone": phone,
                "verify_code": code,
                "source": "",
                "type": 'phoneVerifyCode'
            }
            headers = get_common_headers()
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.base_url + api, headers=headers, cookies=cookies, data=data) as response:
                    res_json = await response.json()
                    success, msg = res_json['success'], res_json['msg']
                    # Merge the Set-Cookie values from the response back in.
                    add_cookies = dict()
                    return_cookies = response.cookies
                    for item in return_cookies.keys():
                        add_cookies[return_cookies[item].key] = return_cookies[item].value
                    cookies.update(add_cookies)
        except Exception as e:
            return False, str(e), res_json
        return success, msg, {
            "cookies": cookies,
            "res_json": res_json
        }

    # QR-code scan login.
    async def creatorGenerateQRcode(self, cookies):
        try:
            api = '/api/cas/customer/web/qr-code'
            data = {
                "service": "https://creator.xiaohongshu.com"
            }
            headers = get_common_headers()
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.base_url + api, headers=headers, cookies=cookies, data=data) as response:
                    res = await response.json()
                    qr_id, verify_url = res['data']['id'], res["data"]["url"]
                    success, msg = res['success'], res['msg']
        except Exception as e:
            return False, str(e), {
                "cookies": cookies,
                "qr_id": None,
                "verify_url": None
            }
        return success, msg, {
            "cookies": cookies,
            "qr_id": qr_id,
            "verify_url": verify_url
        }

    async def creatorCheckQRCodeLogin(self, qr_id, cookies):
        params = {
            "service": "https://creator.xiaohongshu.com",
            "qr_code_id": qr_id,
            "source": ""
        }
        ticket = None
        try:
            api = f"/api/cas/customer/web/qr-code"
            splice_api = splice_str(api, params)
            headers = get_common_headers()
            xs, xt, _ = generate_xs(cookies['a1'], api)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.get(self.base_url + splice_api, headers=headers, cookies=cookies) as response:
                    res = await response.json()
                    success, msg = res['success'], res['msg']
                    code_status = res['data']['status']
                    if code_status == 1:
                        add_cookies = dict()
                        return_cookies = response.cookies
                        for item in return_cookies.keys():
                            add_cookies[return_cookies[item].key] = return_cookies[item].value
                        cookies.update(add_cookies)
                        ticket = res['data'].get('ticket', None)
                        msg = "验证成功"
                    elif code_status == 2:
                        msg = "请扫描二维码"
                    elif code_status == 3:
                        msg = "请确认登录"
                    elif code_status == -1:
                        msg = "验证码过期"
                        raise Exception(msg)
                    else:
                        msg = "未知错误"
                        raise Exception(msg)
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, {
            'cookies': cookies,
            'ticket': ticket
        }

    async def creatorLoginStep1(self, ticket, cookies):
        api = "/sso/customer_login"
        data = {
            "ticket": ticket,
            "login_service": "https://creator.xiaohongshu.com",
            "subsystem_alias": "creator",
            "set_global_domain": True
        }
        msg = '成功'
        res = None  # guard against referencing an unset response on failure
        try:
            headers = get_common_headers()
            xs, xt, data = generate_xs(cookies['a1'], api, data)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.home_url + api, headers=headers, cookies=cookies, data=data) as response:
                    res = await response.json()
                    success = res['success']
                    add_cookies = dict()
                    return_cookies = response.cookies
                    for item in return_cookies.keys():
                        add_cookies[return_cookies[item].key] = return_cookies[item].value
                    cookies.update(add_cookies)
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, {
            'cookies': cookies,
            "userInfo": res
        }

    async def creatorLoginStep2(self, cookies):
        api = "/api/galaxy/user/cas/login"
        msg = '成功'
        try:
            headers = get_common_headers()
            xs, xt, _ = generate_xs(cookies['a1'], api)
            headers['x-s'], headers['x-t'] = xs, str(xt)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.home_url + api, headers=headers, cookies=cookies) as response:
                    res = await response.json()
                    success = res['success']
                    add_cookies = dict()
                    return_cookies = response.cookies
                    for item in return_cookies.keys():
                        add_cookies[return_cookies[item].key] = return_cookies[item].value
                    cookies.update(add_cookies)
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, cookies

    def transfer_cookies(self, cookies):
        """Serialize a cookie dict into a 'k=v; k=v' string."""
        cookies_str = ""
        for key, value in cookies.items():
            cookies_str += f"{key}={value}; "
        cookies_str = cookies_str[:-2]
        return cookies_str

    def generateQrcode(self, verify_url):
        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_L,
            box_size=10,
            border=4,
        )
        qr.add_data(verify_url)
        qr.make(fit=True)
        img = qr.make_image(fill_color="black", back_color="white")
        img.show()

    async def qrcodeMain(self):
        cookies = await self.creatorGenerateInitCookies()
        print('获取初始cookies')
        success, msg, qrcode_dict = await self.creatorGenerateQRcode(cookies)
        print('获取二维码', success, msg)
        print(qrcode_dict)
        qrcode_thread = Thread(target=self.generateQrcode, args=(qrcode_dict['verify_url'],))
        qrcode_thread.start()
        while True:
            success, msg, res = await self.creatorCheckQRCodeLogin(qrcode_dict['qr_id'], qrcode_dict['cookies'])
            print('检查二维码登录', success, msg)
            print(res)
            if msg == "验证成功":
                cookies = res['cookies']
                ticket = res['ticket']
                break
            await asyncio.sleep(10)

        if ticket is None:
            print('登录成功')
        else:
            print('需要ticket继续认证')
            success, msg, res = await self.creatorLoginStep1(ticket, cookies)
            print('ticket认证第一步', success, msg)
            print(res)
            cookies = res['cookies']
            userInfo = res['userInfo']
            success, msg, cookies = await self.creatorLoginStep2(cookies)
            print('ticket认证第二步', success, msg)
            print(cookies)
            print('登录成功')
        cookies_str = self.transfer_cookies(cookies)
        print(f'cookies_str: {cookies_str}')

    async def phoneMain(self):
        cookies = await self.creatorGenerateInitCookies()
        print('获取初始cookies')
        phone_num = ""
        success, msg, res_json = await self.creatorGeneratePhoneCode(phone_num, cookies)
        print('获取手机验证码', success, msg, res_json)
        code = input("请输入验证码:")
        success, msg, res_json = await self.creatorLoginByPhone(phone_num, code, cookies)
        print('手机验证码登录', success, msg, res_json)
        cookies = res_json['cookies']
        cookies_str = self.transfer_cookies(cookies)
        print(f'cookies_str: {cookies_str}')
        self.test(cookies_str)

    def test(self, cookies_str):
        xhs_creator_apis = XHS_Creator_Apis()
        noteInfos = [
            {
                # title
                "title": "我是笨蛋",
                # description
                "desc": "我",
                # 13-digit millisecond timestamp (numeric)
                "postTime": None,
                # location to tag, e.g. "河海大学"
                "location": None,
                # 0: public, 1: private
                "type": 1,
                "topics": ["测试"],
                "media_type": "image",
                # image bytes, at most 15 images
                "images": [
                    open(r"D:\Desktop\Data\images\temp\22.jpg", 'rb').read(),
                    open(r"D:\Desktop\Data\images\temp\22.jpg", 'rb').read(),
                ],
            },
        ]
        for noteInfo in noteInfos:
            success, msg, info = xhs_creator_apis.post_note(noteInfo, cookies_str)
            print(success, msg, info)
            print('========')


if __name__ == '__main__':
    login_util = XHSLoginApi()
    # asyncio.run(login_util.qrcodeMain())
    asyncio.run(login_util.phoneMain())
apis/xhs_pc_apis.py
ADDED
@@ -0,0 +1,1050 @@
| 1 |
+
# encoding: utf-8
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import urllib
|
| 5 |
+
from xhs_utils.xhs_util import splice_str, generate_request_params, generate_x_b3_traceid, get_common_headers
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from xhs_utils.http_client import HttpClient
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
获小红书的api
|
| 11 |
+
:param cookies_str: 你的cookies
|
| 12 |
+
"""
|
| 13 |
+
class XHS_Apis():
|
| 14 |
+
def __init__(self):
|
| 15 |
+
self.base_url = "https://edith.xiaohongshu.com"
|
| 16 |
+
self.client = HttpClient()
|
| 17 |
+
|
| 18 |
+
def get_homefeed_all_channel(self, cookies_str: str, proxies: dict = None):
|
| 19 |
+
"""
|
| 20 |
+
获取主页的所有频道
|
| 21 |
+
返回主页的所有频道
|
| 22 |
+
"""
|
| 23 |
+
res_json = None
|
| 24 |
+
try:
|
| 25 |
+
api = "/api/sns/web/v1/homefeed/category"
|
| 26 |
+
headers, cookies, data = generate_request_params(cookies_str, api, '', 'GET')
|
| 27 |
+
result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, proxies=proxies)
|
| 28 |
+
res_json = result.json
|
| 29 |
+
if not result.ok:
|
| 30 |
+
raise Exception(result.msg)
|
| 31 |
+
success, msg = res_json["success"], res_json["msg"]
|
| 32 |
+
except Exception as e:
|
| 33 |
+
success = False
|
| 34 |
+
msg = str(e)
|
| 35 |
+
return success, msg, res_json
|
    def get_homefeed_recommend(self, category, cursor_score, refresh_type, note_index, cookies_str: str, proxies: dict = None):
        """
        Get recommended notes from the homefeed.
        :param category: the channel you want to fetch
        :param cursor_score: the cursor of the notes you want to fetch
        :param refresh_type: the refresh type
        :param note_index: the index of the notes
        :param cookies_str: your cookies
        Returns the recommended homefeed notes.
        """
        res_json = None
        try:
            api = f"/api/sns/web/v1/homefeed"
            data = {
                "cursor_score": cursor_score,
                "num": 20,
                "refresh_type": refresh_type,
                "note_index": note_index,
                "unread_begin_note_id": "",
                "unread_end_note_id": "",
                "unread_note_count": 0,
                "category": category,
                "search_key": "",
                "need_num": 10,
                "image_formats": [
                    "jpg",
                    "webp",
                    "avif"
                ],
                "need_filter_image": False
            }
            headers, cookies, trans_data = generate_request_params(cookies_str, api, data, 'POST')
            result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=trans_data, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_homefeed_recommend_by_num(self, category, require_num, cookies_str: str, proxies: dict = None):
        """
        Get a given number of recommended homefeed notes.
        :param category: the channel you want to fetch
        :param require_num: the number of notes you want
        :param cookies_str: your cookies
        Returns the requested number of recommended homefeed notes.
        """
        cursor_score, refresh_type, note_index = "", 1, 0
        note_list = []
        try:
            while True:
                success, msg, res_json = self.get_homefeed_recommend(category, cursor_score, refresh_type, note_index, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                if "items" not in res_json["data"]:
                    break
                notes = res_json["data"]["items"]
                note_list.extend(notes)
                cursor_score = res_json["data"]["cursor_score"]
                refresh_type = 3
                note_index += 20
                if len(note_list) >= require_num:  # stop once enough notes have been collected
                    break
        except Exception as e:
            success = False
            msg = str(e)
        if len(note_list) > require_num:
            note_list = note_list[:require_num]
        return success, msg, note_list
    def get_user_info(self, user_id: str, cookies_str: str, proxies: dict = None):
        """
        Get a user's info.
        :param user_id: the id of the user you want to fetch
        :param cookies_str: your cookies
        Returns the user's info.
        """
        res_json = None
        try:
            api = f"/api/sns/web/v1/user/otherinfo"
            params = {
                "target_user_id": user_id
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_user_self_info(self, cookies_str: str, proxies: dict = None):
        """
        Get your own user info (endpoint 1).
        :param cookies_str: your cookies
        Returns your own user info (endpoint 1).
        """
        res_json = None
        try:
            api = f"/api/sns/web/v1/user/selfinfo"
            headers, cookies, data = generate_request_params(cookies_str, api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_user_self_info2(self, cookies_str: str, proxies: dict = None):
        """
        Get your own user info (endpoint 2).
        :param cookies_str: your cookies
        Returns your own user info (endpoint 2).
        """
        res_json = None
        try:
            api = f"/api/sns/web/v2/user/me"
            headers, cookies, data = generate_request_params(cookies_str, api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_user_note_info(self, user_id: str, cursor: str, cookies_str: str, xsec_token='', xsec_source='', proxies: dict = None):
        """
        Get one page of a user's posted notes at the given cursor.
        :param user_id: the id of the user you want to fetch
        :param cursor: the cursor of the notes you want to fetch
        :param cookies_str: your cookies
        Returns the user's notes at the given position.
        """
        res_json = None
        try:
            api = f"/api/sns/web/v1/user_posted"
            params = {
                "num": "30",
                "cursor": cursor,
                "user_id": user_id,
                "image_formats": "jpg,webp,avif",
                "xsec_token": xsec_token,
                "xsec_source": xsec_source,
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json
    def get_user_all_notes(self, user_url: str, cookies_str: str, proxies: dict = None):
        """
        Get all notes of a user.
        :param user_url: the homepage URL of the user you want to fetch
        :param cookies_str: your cookies
        Returns all of the user's notes.
        """
        cursor = ''
        note_list = []
        try:
            urlParse = urllib.parse.urlparse(user_url)
            user_id = urlParse.path.split("/")[-1]
            kvs = urlParse.query.split('&')
            # maxsplit=1 keeps the '=' padding that often ends xsec_token values
            kvDist = {kv.split('=', 1)[0]: kv.split('=', 1)[1] for kv in kvs}
            xsec_token = kvDist['xsec_token'] if 'xsec_token' in kvDist else ""
            xsec_source = kvDist['xsec_source'] if 'xsec_source' in kvDist else "pc_search"
            while True:
                success, msg, res_json = self.get_user_note_info(user_id, cursor, cookies_str, xsec_token, xsec_source, proxies)
                if not success:
                    raise Exception(msg)
                notes = res_json["data"]["notes"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                note_list.extend(notes)
                if len(notes) == 0 or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, note_list
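The profile URL parsing above extracts user_id plus the xsec_token / xsec_source query values by hand. A sketch of the same extraction with the standard library's query parser, which also tolerates missing keys; the URL is the example profile used later in this file:

from urllib.parse import urlparse, parse_qs

url = "https://www.xiaohongshu.com/user/profile/67a332a2000000000d008358?xsec_token=ABTf9yz4cLHhTycIlksF0jOi1yIZgfcaQ6IXNNGdKJ8xg=&xsec_source=pc_feed"
parsed = urlparse(url)
user_id = parsed.path.rstrip("/").split("/")[-1]
qs = parse_qs(parsed.query)  # splits each pair on the first '=', so token padding survives
xsec_token = qs.get("xsec_token", [""])[0]
xsec_source = qs.get("xsec_source", ["pc_search"])[0]
print(user_id, xsec_token, xsec_source)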
    def get_user_like_note_info(self, user_id: str, cursor: str, cookies_str: str, xsec_token='', xsec_source='', proxies: dict = None):
        """
        Get one page of a user's liked notes at the given cursor.
        :param user_id: the id of the user you want to fetch
        :param cursor: the cursor of the notes you want to fetch
        :param cookies_str: your cookies
        Returns the user's liked notes at the given position.
        """
        res_json = None
        try:
            api = f"/api/sns/web/v1/note/like/page"
            params = {
                "num": "30",
                "cursor": cursor,
                "user_id": user_id,
                "image_formats": "jpg,webp,avif",
                "xsec_token": xsec_token,
                "xsec_source": xsec_source,
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_user_all_like_note_info(self, user_url: str, cookies_str: str, proxies: dict = None):
        """
        Get all of a user's liked notes.
        :param user_url: the homepage URL of the user you want to fetch
        :param cookies_str: your cookies
        Returns all of the user's liked notes.
        """
        cursor = ''
        note_list = []
        try:
            urlParse = urllib.parse.urlparse(user_url)
            user_id = urlParse.path.split("/")[-1]
            kvs = urlParse.query.split('&')
            kvDist = {kv.split('=', 1)[0]: kv.split('=', 1)[1] for kv in kvs}
            xsec_token = kvDist['xsec_token'] if 'xsec_token' in kvDist else ""
            xsec_source = kvDist['xsec_source'] if 'xsec_source' in kvDist else "pc_user"
            while True:
                success, msg, res_json = self.get_user_like_note_info(user_id, cursor, cookies_str, xsec_token,
                                                                      xsec_source, proxies)
                if not success:
                    raise Exception(msg)
                notes = res_json["data"]["notes"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                note_list.extend(notes)
                if len(notes) == 0 or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, note_list

    def get_user_collect_note_info(self, user_id: str, cursor: str, cookies_str: str, xsec_token='', xsec_source='', proxies: dict = None):
        """
        Get one page of a user's collected notes at the given cursor.
        :param user_id: the id of the user you want to fetch
        :param cursor: the cursor of the notes you want to fetch
        :param cookies_str: your cookies
        Returns the user's collected notes at the given position.
        """
        res_json = None
        try:
            api = f"/api/sns/web/v2/note/collect/page"
            params = {
                "num": "30",
                "cursor": cursor,
                "user_id": user_id,
                "image_formats": "jpg,webp,avif",
                "xsec_token": xsec_token,
                "xsec_source": xsec_source,
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_user_all_collect_note_info(self, user_url: str, cookies_str: str, proxies: dict = None):
        """
        Get all of a user's collected notes.
        :param user_url: the homepage URL of the user you want to fetch
        :param cookies_str: your cookies
        Returns all of the user's collected notes.
        """
        cursor = ''
        note_list = []
        try:
            urlParse = urllib.parse.urlparse(user_url)
            user_id = urlParse.path.split("/")[-1]
            kvs = urlParse.query.split('&')
            kvDist = {kv.split('=', 1)[0]: kv.split('=', 1)[1] for kv in kvs}
            xsec_token = kvDist['xsec_token'] if 'xsec_token' in kvDist else ""
            xsec_source = kvDist['xsec_source'] if 'xsec_source' in kvDist else "pc_search"
            while True:
                success, msg, res_json = self.get_user_collect_note_info(user_id, cursor, cookies_str, xsec_token,
                                                                         xsec_source, proxies)
                if not success:
                    raise Exception(msg)
                notes = res_json["data"]["notes"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                note_list.extend(notes)
                if len(notes) == 0 or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, note_list
    def get_note_info(self, url: str, cookies_str: str, proxies: dict = None):
        """
        Get the details of a note.
        :param url: the URL of the note you want to fetch (xsec_token and xsec_source are parsed from it; xsec_source defaults to pc_search, other values are pc_user and pc_feed)
        :param cookies_str: your cookies
        Returns the note's details.
        """
        res_json = None
        try:
            urlParse = urllib.parse.urlparse(url)
            note_id = urlParse.path.split("/")[-1]
            kvs = urlParse.query.split('&')
            kvDist = {kv.split('=', 1)[0]: kv.split('=', 1)[1] for kv in kvs}
            api = f"/api/sns/web/v1/feed"
            data = {
                "source_note_id": note_id,
                "image_formats": [
                    "jpg",
                    "webp",
                    "avif"
                ],
                "extra": {
                    "need_body_topic": "1"
                },
                "xsec_source": kvDist['xsec_source'] if 'xsec_source' in kvDist else "pc_search",
                "xsec_token": kvDist['xsec_token']
            }
            headers, cookies, data = generate_request_params(cookies_str, api, data, 'POST')
            result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json
    def get_search_keyword(self, word: str, cookies_str: str, proxies: dict = None):
        """
        Get search keyword suggestions.
        :param word: your keyword
        :param cookies_str: your cookies
        Returns the keyword suggestions.
        """
        res_json = None
        try:
            api = "/api/sns/web/v1/search/recommend"
            params = {
                "keyword": urllib.parse.quote(word)
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def search_note(self, query: str, cookies_str: str, page=1, sort_type_choice=0, note_type=0, note_time=0, note_range=0, pos_distance=0, geo="", proxies: dict = None):
        """
        Search notes.
        :param query: the search keyword
        :param cookies_str: your cookies
        :param page: the page to fetch
        :param sort_type_choice: sort order: 0 general, 1 latest, 2 most liked, 3 most commented, 4 most collected
        :param note_type: note type: 0 all, 1 video notes, 2 normal notes
        :param note_time: note time: 0 all, 1 within a day, 2 within a week, 3 within half a year
        :param note_range: note range: 0 all, 1 viewed, 2 not viewed, 3 followed
        :param pos_distance: position distance: 0 all, 1 same city, 2 nearby (geo must be set when this is used)
        :param geo: location info (latitude and longitude)
        Returns the search results.
        """
        res_json = None
        # map the numeric choices onto the literal filter values the API expects
        sort_type = "general"
        if sort_type_choice == 1:
            sort_type = "time_descending"
        elif sort_type_choice == 2:
            sort_type = "popularity_descending"
        elif sort_type_choice == 3:
            sort_type = "comment_descending"
        elif sort_type_choice == 4:
            sort_type = "collect_descending"
        filter_note_type = "不限"
        if note_type == 1:
            filter_note_type = "视频笔记"
        elif note_type == 2:
            filter_note_type = "普通笔记"
        filter_note_time = "不限"
        if note_time == 1:
            filter_note_time = "一天内"
        elif note_time == 2:
            filter_note_time = "一周内"
        elif note_time == 3:
            filter_note_time = "半年内"
        filter_note_range = "不限"
        if note_range == 1:
            filter_note_range = "已看过"
        elif note_range == 2:
            filter_note_range = "未看过"
        elif note_range == 3:
            filter_note_range = "已关注"
        filter_pos_distance = "不限"
        if pos_distance == 1:
            filter_pos_distance = "同城"
        elif pos_distance == 2:
            filter_pos_distance = "附近"
        if geo:
            geo = json.dumps(geo, separators=(',', ':'))
        try:
            api = "/api/sns/web/v1/search/notes"
            data = {
                "keyword": query,
                "page": page,
                "page_size": 20,
                "search_id": generate_x_b3_traceid(21),
                "sort": "general",
                "note_type": 0,
                "ext_flags": [],
                "filters": [
                    {
                        "tags": [
                            sort_type
                        ],
                        "type": "sort_type"
                    },
                    {
                        "tags": [
                            filter_note_type
                        ],
                        "type": "filter_note_type"
                    },
                    {
                        "tags": [
                            filter_note_time
                        ],
                        "type": "filter_note_time"
                    },
                    {
                        "tags": [
                            filter_note_range
                        ],
                        "type": "filter_note_range"
                    },
                    {
                        "tags": [
                            filter_pos_distance
                        ],
                        "type": "filter_pos_distance"
                    }
                ],
                "geo": geo,
                "image_formats": [
                    "jpg",
                    "webp",
                    "avif"
                ]
            }
            headers, cookies, data = generate_request_params(cookies_str, api, data, 'POST')
            result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json
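The five if/elif ladders above map numeric choices onto the literal filter strings the search endpoint expects. A minimal equivalent sketch using lookup tables, which keeps the valid values in one place; behavior is identical, with dict.get supplying the defaults:

sort_type_choice, note_type, note_time = 1, 2, 0  # example choices
SORT_TYPES = {0: "general", 1: "time_descending", 2: "popularity_descending",
              3: "comment_descending", 4: "collect_descending"}
NOTE_TYPES = {0: "不限", 1: "视频笔记", 2: "普通笔记"}
NOTE_TIMES = {0: "不限", 1: "一天内", 2: "一周内", 3: "半年内"}

sort_type = SORT_TYPES.get(sort_type_choice, "general")
filter_note_type = NOTE_TYPES.get(note_type, "不限")
filter_note_time = NOTE_TIMES.get(note_time, "不限")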
    def search_some_note(self, query: str, require_num: int, cookies_str: str, sort_type_choice=0, note_type=0, note_time=0, note_range=0, pos_distance=0, geo="", proxies: dict = None):
        """
        Search notes with a required count, sort order, note type and filters.
        :param query: the search keyword
        :param require_num: the number of results you want
        :param cookies_str: your cookies
        :param sort_type_choice: sort order: 0 general, 1 latest, 2 most liked, 3 most commented, 4 most collected
        :param note_type: note type: 0 all, 1 video notes, 2 normal notes
        :param note_time: note time: 0 all, 1 within a day, 2 within a week, 3 within half a year
        :param note_range: note range: 0 all, 1 viewed, 2 not viewed, 3 followed
        :param pos_distance: position distance: 0 all, 1 same city, 2 nearby (geo must be set when this is used)
        :param geo: location info (latitude and longitude)
        Returns the search results.
        """
        page = 1
        note_list = []
        try:
            while True:
                success, msg, res_json = self.search_note(query, cookies_str, page, sort_type_choice, note_type, note_time, note_range, pos_distance, geo, proxies)
                if not success:
                    raise Exception(msg)
                if "items" not in res_json["data"]:
                    break
                notes = res_json["data"]["items"]
                note_list.extend(notes)
                page += 1
                if len(note_list) >= require_num or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        if len(note_list) > require_num:
            note_list = note_list[:require_num]
        return success, msg, note_list
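A hedged usage sketch for the paginated search above; the keyword and count are arbitrary examples and cookies_str is a placeholder:

apis = XHS_Apis()
# fetch up to 30 video notes for the keyword, newest first (placeholders throughout)
success, msg, notes = apis.search_some_note("榴莲", 30, cookies_str, sort_type_choice=1, note_type=1)
if success:
    for note in notes:
        # search items typically expose an "id"; inspect the payload for the other fields
        print(note.get("id"))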
    def search_user(self, query: str, cookies_str: str, page=1, proxies: dict = None):
        """
        Search users.
        :param query: the search keyword
        :param cookies_str: your cookies
        :param page: the page to fetch
        Returns the search results.
        """
        res_json = None
        try:
            api = "/api/sns/web/v1/search/usersearch"
            data = {
                "search_user_request": {
                    "keyword": query,
                    "search_id": "2dn9they1jbjxwawlo4xd",
                    "page": page,
                    "page_size": 15,
                    "biz_type": "web_search_user",
                    "request_id": "22471139-1723999898524"
                }
            }
            headers, cookies, data = generate_request_params(cookies_str, api, data, 'POST')
            result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def search_some_user(self, query: str, require_num: int, cookies_str: str, proxies: dict = None):
        """
        Search users with a required count.
        :param query: the search keyword
        :param require_num: the number of results you want
        :param cookies_str: your cookies
        Returns the search results.
        """
        page = 1
        user_list = []
        try:
            while True:
                success, msg, res_json = self.search_user(query, cookies_str, page, proxies)
                if not success:
                    raise Exception(msg)
                if "users" not in res_json["data"]:
                    break
                users = res_json["data"]["users"]
                user_list.extend(users)
                page += 1
                if len(user_list) >= require_num or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        if len(user_list) > require_num:
            user_list = user_list[:require_num]
        return success, msg, user_list
    def get_note_out_comment(self, note_id: str, cursor: str, xsec_token: str, cookies_str: str, proxies: dict = None):
        """
        Get one page of a note's top-level comments.
        :param note_id: the id of the note
        :param cursor: the cursor of the comments you want to fetch
        :param cookies_str: your cookies
        Returns the note's top-level comments at the given position.
        """
        res_json = None
        try:
            api = "/api/sns/web/v2/comment/page"
            params = {
                "note_id": note_id,
                "cursor": cursor,
                "top_comment_id": "",
                "image_formats": "jpg,webp,avif",
                "xsec_token": xsec_token
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_note_all_out_comment(self, note_id: str, xsec_token: str, cookies_str: str, proxies: dict = None):
        """
        Get all of a note's top-level comments.
        :param note_id: the id of the note
        :param cookies_str: your cookies
        Returns all of the note's top-level comments.
        """
        cursor = ''
        note_out_comment_list = []
        try:
            while True:
                success, msg, res_json = self.get_note_out_comment(note_id, cursor, xsec_token, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                comments = res_json["data"]["comments"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                note_out_comment_list.extend(comments)
                if len(comments) == 0 or not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, note_out_comment_list

    def get_note_inner_comment(self, comment: dict, cursor: str, xsec_token: str, cookies_str: str, proxies: dict = None):
        """
        Get one page of a comment's sub-comments (replies).
        :param comment: the top-level comment
        :param cursor: the cursor of the sub-comments you want to fetch
        :param cookies_str: your cookies
        Returns the sub-comments at the given position.
        """
        res_json = None
        try:
            api = "/api/sns/web/v2/comment/sub/page"
            params = {
                "note_id": comment['note_id'],
                "root_comment_id": comment['id'],
                "num": "10",
                "cursor": cursor,
                "image_formats": "jpg,webp,avif",
                "top_comment_id": '',
                "xsec_token": xsec_token
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_note_all_inner_comment(self, comment: dict, xsec_token: str, cookies_str: str, proxies: dict = None):
        """
        Get all of a comment's sub-comments.
        :param comment: the top-level comment
        :param cookies_str: your cookies
        Returns the comment with all of its sub-comments filled in.
        """
        try:
            if not comment['sub_comment_has_more']:
                return True, 'success', comment
            cursor = comment['sub_comment_cursor']
            inner_comment_list = []
            while True:
                success, msg, res_json = self.get_note_inner_comment(comment, cursor, xsec_token, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                comments = res_json["data"]["comments"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                inner_comment_list.extend(comments)
                if not res_json["data"]["has_more"]:
                    break
            comment['sub_comments'].extend(inner_comment_list)
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, comment

    def get_note_all_comment(self, url: str, cookies_str: str, proxies: dict = None):
        """
        Get all comments of a note.
        :param url: the URL of the note you want to fetch
        :param cookies_str: your cookies
        Returns all comments of the note.
        """
        out_comment_list = []
        try:
            urlParse = urllib.parse.urlparse(url)
            note_id = urlParse.path.split("/")[-1]
            kvs = urlParse.query.split('&')
            kvDist = {kv.split('=', 1)[0]: kv.split('=', 1)[1] for kv in kvs}
            success, msg, out_comment_list = self.get_note_all_out_comment(note_id, kvDist['xsec_token'], cookies_str, proxies)
            if not success:
                raise Exception(msg)
            for comment in out_comment_list:
                success, msg, new_comment = self.get_note_all_inner_comment(comment, kvDist['xsec_token'], cookies_str, proxies)
                if not success:
                    raise Exception(msg)
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, out_comment_list
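get_note_all_comment returns the top-level comments with their sub_comments lists filled in by get_note_all_inner_comment. A small sketch of walking that structure, for example to count every comment including replies:

def count_comments(out_comment_list):
    # one for each top-level comment, plus all of its fetched sub-comments
    total = 0
    for comment in out_comment_list:
        total += 1 + len(comment.get("sub_comments", []))
    return total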
    def get_unread_message(self, cookies_str: str, proxies: dict = None):
        """
        Get the unread message counts.
        :param cookies_str: your cookies
        Returns the unread message counts.
        """
        res_json = None
        try:
            api = "/api/sns/web/unread_count"
            headers, cookies, data = generate_request_params(cookies_str, api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_metions(self, cursor: str, cookies_str: str, proxies: dict = None):
        """
        Get comment and @-mention notifications.
        :param cursor: the cursor of the notifications you want to fetch
        :param cookies_str: your cookies
        Returns the comment and @-mention notifications.
        """
        res_json = None
        try:
            api = "/api/sns/web/v1/you/mentions"
            params = {
                "num": "20",
                "cursor": cursor
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_all_metions(self, cookies_str: str, proxies: dict = None):
        """
        Get all comment and @-mention notifications.
        :param cookies_str: your cookies
        Returns all comment and @-mention notifications.
        """
        cursor = ''
        metions_list = []
        try:
            while True:
                success, msg, res_json = self.get_metions(cursor, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                metions = res_json["data"]["message_list"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                metions_list.extend(metions)
                if not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, metions_list

    def get_likesAndcollects(self, cursor: str, cookies_str: str, proxies: dict = None):
        """
        Get like and collect notifications.
        :param cursor: the cursor of the notifications you want to fetch
        :param cookies_str: your cookies
        Returns the like and collect notifications.
        """
        res_json = None
        try:
            api = "/api/sns/web/v1/you/likes"
            params = {
                "num": "20",
                "cursor": cursor
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_all_likesAndcollects(self, cookies_str: str, proxies: dict = None):
        """
        Get all like and collect notifications.
        :param cookies_str: your cookies
        Returns all like and collect notifications.
        """
        cursor = ''
        likesAndcollects_list = []
        try:
            while True:
                success, msg, res_json = self.get_likesAndcollects(cursor, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                likesAndcollects = res_json["data"]["message_list"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                likesAndcollects_list.extend(likesAndcollects)
                if not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, likesAndcollects_list

    def get_new_connections(self, cursor: str, cookies_str: str, proxies: dict = None):
        """
        Get new-follower notifications.
        :param cursor: the cursor of the notifications you want to fetch
        :param cookies_str: your cookies
        Returns the new-follower notifications.
        """
        res_json = None
        try:
            api = "/api/sns/web/v1/you/connections"
            params = {
                "num": "20",
                "cursor": cursor
            }
            splice_api = splice_str(api, params)
            headers, cookies, data = generate_request_params(cookies_str, splice_api, '', 'GET')
            result = self.client.request_json("GET", self.base_url + splice_api, headers=headers, cookies=cookies, proxies=proxies)
            res_json = result.json
            if not result.ok:
                raise Exception(result.msg)
            success, msg = res_json["success"], res_json["msg"]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, res_json

    def get_all_new_connections(self, cookies_str: str, proxies: dict = None):
        """
        Get all new-follower notifications.
        :param cookies_str: your cookies
        Returns all new-follower notifications.
        """
        cursor = ''
        connections_list = []
        try:
            while True:
                success, msg, res_json = self.get_new_connections(cursor, cookies_str, proxies)
                if not success:
                    raise Exception(msg)
                connections = res_json["data"]["message_list"]
                if 'cursor' in res_json["data"]:
                    cursor = str(res_json["data"]["cursor"])
                else:
                    break
                connections_list.extend(connections)
                if not res_json["data"]["has_more"]:
                    break
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, connections_list
    @staticmethod
    def get_note_no_water_video(note_id):
        """
        Get the watermark-free video of a note.
        :param note_id: the id of the note you want to fetch
        Returns the watermark-free video address.
        """
        success = True
        msg = '成功'
        video_addr = None
        try:
            headers = get_common_headers()
            url = f"https://www.xiaohongshu.com/explore/{note_id}"
            client = HttpClient()
            result = client.request_text("GET", url, headers=headers)
            if not result.ok:
                raise Exception(result.msg)
            res = result.text
            # the note page embeds the raw video address in its og:video meta tag
            video_addr = re.findall(r'<meta name="og:video" content="(.*?)">', res)[0]
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, video_addr
    @staticmethod
    def get_note_no_water_img(img_url):
        """
        Get the watermark-free image URL.
        :param img_url: the URL of the image you want to fetch
        Returns the watermark-free image URL.
        """
        success = True
        msg = '成功'
        new_url = None
        try:
            # Newer image resources keep the notes_pre_post token and are served as JPEG via ci.xiaohongshu.com.
            # e.g.:
            # https://sns-webpic-qc.xhscdn.com/<time>/<hash>/notes_pre_post/<img_id>!nd_dft_wlteh_webp_3
            # -> https://ci.xiaohongshu.com/notes_pre_post/<img_id>?imageView2/format/jpeg
            if 'notes_pre_post/' in img_url:
                token = 'notes_pre_post/' + img_url.split('notes_pre_post/', 1)[1].split('!', 1)[0].split('?', 1)[0]
                new_url = f'https://ci.xiaohongshu.com/{token}?imageView2/format/jpeg'
            elif 'spectrum' in img_url:
                token = '/'.join(img_url.split('/')[-2:]).split('!', 1)[0].split('?', 1)[0]
                new_url = f'https://ci.xiaohongshu.com/{token}?imageView2/format/jpeg'
            elif '.jpg' in img_url:
                token = '/'.join(img_url.split('/')[-3:]).split('!', 1)[0].split('?', 1)[0]
                new_url = f'https://ci.xiaohongshu.com/{token}?imageView2/format/jpeg'
            else:
                token = img_url.split('/')[-1].split('!', 1)[0].split('?', 1)[0]
                new_url = f'https://ci.xiaohongshu.com/{token}?imageView2/format/jpeg'
        except Exception as e:
            success = False
            msg = str(e)
        return success, msg, new_url
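A quick check of the notes_pre_post branch above, using a made-up CDN URL of the shape documented in the comment; the ids are placeholders, not real resources:

img_url = "https://sns-webpic-qc.xhscdn.com/202401010000/abcdef/notes_pre_post/example_img_id!nd_dft_wlteh_webp_3"
success, msg, new_url = XHS_Apis.get_note_no_water_img(img_url)
print(new_url)
# -> https://ci.xiaohongshu.com/notes_pre_post/example_img_id?imageView2/format/jpeg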
if __name__ == '__main__':
    """
    Usage examples for the XiaoHongShu APIs.
    All of the data-scraping APIs live in this file.
    Data-injection APIs violate the platform rules; do not attempt them.
    """
    xhs_apis = XHS_Apis()
    cookies_str = r''
    # Get user info
    user_url = 'https://www.xiaohongshu.com/user/profile/67a332a2000000000d008358?xsec_token=ABTf9yz4cLHhTycIlksF0jOi1yIZgfcaQ6IXNNGdKJ8xg=&xsec_source=pc_feed'
    success, msg, user_info = xhs_apis.get_user_info('67a332a2000000000d008358', cookies_str)
    logger.info(f'获取用户信息结果 {json.dumps(user_info, ensure_ascii=False)}: {success}, msg: {msg}')
    success, msg, note_list = xhs_apis.get_user_all_notes(user_url, cookies_str)
    logger.info(f'获取用户所有笔记结果 {json.dumps(note_list, ensure_ascii=False)}: {success}, msg: {msg}')
    # Get note info
    note_url = r'https://www.xiaohongshu.com/explore/67d7c713000000000900e391?xsec_token=AB1ACxbo5cevHxV_bWibTmK8R1DDz0NnAW1PbFZLABXtE=&xsec_source=pc_user'
    success, msg, note_info = xhs_apis.get_note_info(note_url, cookies_str)
    logger.info(f'获取笔记信息结果 {json.dumps(note_info, ensure_ascii=False)}: {success}, msg: {msg}')
    # Get search keyword suggestions
    query = "榴莲"
    success, msg, search_keyword = xhs_apis.get_search_keyword(query, cookies_str)
    logger.info(f'获取搜索关键词结果 {json.dumps(search_keyword, ensure_ascii=False)}: {success}, msg: {msg}')
    # Search notes
    query = "榴莲"
    query_num = 10
    sort_type_choice = 0  # search_some_note expects the numeric sort choice, not a string
    note_type = 0
    success, msg, notes = xhs_apis.search_some_note(query, query_num, cookies_str, sort_type_choice, note_type)
    logger.info(f'搜索笔记结果 {json.dumps(notes, ensure_ascii=False)}: {success}, msg: {msg}')
    # Get note comments
    note_url = r'https://www.xiaohongshu.com/explore/67d7c713000000000900e391?xsec_token=AB1ACxbo5cevHxV_bWibTmK8R1DDz0NnAW1PbFZLABXtE=&xsec_source=pc_user'
    success, msg, note_all_comment = xhs_apis.get_note_all_comment(note_url, cookies_str)
    logger.info(f'获取笔记评论结果 {json.dumps(note_all_comment, ensure_ascii=False)}: {success}, msg: {msg}')
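The example block leaves cookies_str empty, and every call above fails without it. One hedged way to supply it without hardcoding; the XHS_COOKIES variable name is an invention for this sketch, not something the project defines:

import os

# hypothetical environment variable; pick whatever name your deployment uses
cookies_str = os.environ.get("XHS_COOKIES", "")
if not cookies_str:
    raise SystemExit("set XHS_COOKIES to your xiaohongshu.com cookie string first")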
apis/xhs_pc_login_apis.py
ADDED
@@ -0,0 +1,208 @@
import json
from threading import Thread

import aiohttp
import asyncio

import qrcode

from xhs_utils.xhs_util import generate_headers, splice_str
from playwright.async_api import async_playwright


class XHSLoginApi:
    def __init__(self):
        self.base_url = "https://edith.xiaohongshu.com"
        self.home_url = 'https://www.xiaohongshu.com/explore'
        self.generate_qrcode_api = '/api/sns/web/v1/login/qrcode/create'

    # Generate the initial cookies
    async def xhsCheckInitCookies(self, page):
        # poll the page until the anonymous device cookies (webId / a1 / gid) have been set
        while True:
            cookies = dict()
            page_cookies = await page.context.cookies()
            for cookie in page_cookies:
                cookies[cookie['name']] = cookie['value']
            if "webId" in cookies and "a1" in cookies and "gid" in cookies:
                break
            await asyncio.sleep(1)
        if 'web_session' in cookies:
            del cookies['web_session']
        return cookies

    async def xhsGenerateInitCookies(self, headless=True):
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=headless,
                args=[
                    '--disable-blink-features=AutomationControlled',
                ],
            )
            page = await browser.new_page()
            await page.goto(self.home_url)
            cookies = await self.xhsCheckInitCookies(page)
            await browser.close()
            return cookies
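A minimal sketch of fetching the anonymous device cookies on their own. It only assumes the class above and that Playwright's browser is installed (e.g. via `playwright install chromium`):

import asyncio

# one-off fetch of the pre-login device cookies (webId / a1 / gid)
cookies = asyncio.run(XHSLoginApi().xhsGenerateInitCookies(headless=True))
print(cookies.get("a1"))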
    # Phone verification-code login
    async def xhsGeneratePhoneVerificationCode(self, phone_num, cookies):
        try:
            api = "/api/sns/web/v2/login/send_code"
            params = {
                "phone": phone_num,
                "zone": "86",
                "type": "login"
            }
            splice_api = splice_str(api, params)
            headers, _ = generate_headers(cookies['a1'], splice_api)
            async with aiohttp.ClientSession() as session:
                async with session.get(self.base_url + splice_api, headers=headers, cookies=cookies) as response:
                    res = await response.json()
                    print(res)
                    success, msg = res['success'], res['msg']
        except Exception as e:
            success, msg = False, str(e)
        return success, msg

    async def xhsCheckPhoneVerificationCode(self, phone_num, code, cookies):
        mobile_token = None
        try:
            api = "/api/sns/web/v1/login/check_code"
            params = {
                "phone": phone_num,
                "zone": "86",
                "code": code
            }
            splice_api = splice_str(api, params)
            headers, _ = generate_headers(cookies['a1'], splice_api)
            async with aiohttp.ClientSession() as session:
                async with session.get(self.base_url + splice_api, headers=headers, cookies=cookies) as response:
                    res = await response.json()
                    success, msg = res['success'], res['msg']
                    mobile_token = res['data']['mobile_token']
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, mobile_token

    async def xhsPhoneVerificationCodeLogin(self, mobile_token, phone, cookies):
        cookies_str = None
        try:
            api = "/api/sns/web/v2/login/code"
            data = {
                "mobile_token": mobile_token,
                "zone": "86",
                "phone": phone
            }
            headers, data = generate_headers(cookies['a1'], api, data)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.base_url + api, headers=headers, cookies=cookies, data=data) as response:
                    res = await response.json()
                    success, msg = res['success'], res['msg']
                    cookies['web_session'] = res['data']['session']
                    # join the cookie dict back into a "k=v; k=v" header string
                    cookies_str = '; '.join(f'{key}={value}' for key, value in cookies.items())
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, cookies_str
    # QR-code scan login
    async def xhsGenerateQRcode(self, cookies):
        try:
            data = {
                "qr_type": 1
            }
            headers, data = generate_headers(cookies['a1'], self.generate_qrcode_api, data)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.base_url + self.generate_qrcode_api, headers=headers, cookies=cookies, data=data) as response:
                    res = await response.json()
                    qr_id, code, verify_url = res['data']['qr_id'], res['data']['code'], res["data"]["url"]
                    success, msg = res['success'], res['msg']
        except Exception as e:
            return False, str(e), {
                "cookies": cookies,
                "qr_id": None,
                "code": None,
                "verify_url": None
            }
        return success, msg, {
            "cookies": cookies,
            "qr_id": qr_id,
            "code": code,
            "verify_url": verify_url
        }

    async def xhsCheckQRCodeLogin(self, qr_id, code, cookies):
        cookies_str = None
        res = None  # keep res defined even when the request itself fails
        try:
            check_api = f"/api/sns/web/v1/login/qrcode/status?qr_id={qr_id}&code={code}"
            headers, _ = generate_headers(cookies['a1'], check_api)
            headers['x-login-mode'] = ""
            async with aiohttp.ClientSession() as session:
                async with session.get(self.base_url + check_api, headers=headers, cookies=cookies) as response:
                    res = await response.json()
                    success, msg = res['success'], res['msg']
                    code_status = res['data']['code_status']
                    if code_status == 0:
                        msg = "请扫描二维码"
                    elif code_status == 1:
                        msg = "请确认登录"
                    elif code_status == 2:
                        cookies['web_session'] = res['data']['login_info']['session']
                        cookies_str = '; '.join(f'{key}={value}' for key, value in cookies.items())
                    elif code_status == 3:
                        msg = "二维码已失效"
                        raise Exception(msg)
                    else:
                        msg = "未知code_status"
                        raise Exception(msg)
        except Exception as e:
            success, msg = False, str(e)
        return success, msg, {
            'cookies_str': cookies_str,
            'res': res
        }

    def generateQrcode(self, verify_url):
        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_L,
            box_size=10,
            border=4,
        )
        qr.add_data(verify_url)
        qr.make(fit=True)
        img = qr.make_image(fill_color="black", back_color="white")
        img.show()
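qrcodeMain below polls xhsCheckQRCodeLogin once a second. A bounded variant of that polling loop, as a sketch using only the class above; the 120-attempt cap is an arbitrary choice, roughly the lifetime of a login QR code:

async def poll_qr_login(login_api, qrcode_dict, max_attempts=120):
    # poll until the QR code is confirmed or the attempt budget runs out
    for _ in range(max_attempts):
        success, msg, res = await login_api.xhsCheckQRCodeLogin(
            qrcode_dict['qr_id'], qrcode_dict['code'], qrcode_dict['cookies'])
        if res and res['cookies_str']:
            return res['cookies_str']
        if not success:
            raise RuntimeError(msg)
        await asyncio.sleep(1)
    raise TimeoutError("QR code login not confirmed in time")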
    async def qrcodeMain(self):
        cookies = await self.xhsGenerateInitCookies()
        success, msg, qrcode_dict = await self.xhsGenerateQRcode(cookies)
        qrcode_thread = Thread(target=self.generateQrcode, args=(qrcode_dict['verify_url'],))
        qrcode_thread.start()
        # asyncio.create_task(asyncio.to_thread(self.generateQrcode, qrcode_dict['verify_url']))
        while True:
            success, msg, res = await self.xhsCheckQRCodeLogin(qrcode_dict['qr_id'], qrcode_dict['code'], qrcode_dict['cookies'])
            print(success, msg, res)
            print(res['cookies_str'])
            if res['cookies_str']:
                # login confirmed: the session cookie string is ready, stop polling
                break
            await asyncio.sleep(1)

    async def phoneMain(self):
        cookies = await self.xhsGenerateInitCookies()
        phone_num = ""
        success, msg = await self.xhsGeneratePhoneVerificationCode(phone_num, cookies)
        print(success, msg)
        code = input("请输入验证码:")
        success, msg, mobile_token = await self.xhsCheckPhoneVerificationCode(phone_num, code, cookies)
        print(success, msg, mobile_token)
        success, msg, cookies_str = await self.xhsPhoneVerificationCodeLogin(mobile_token, phone_num, cookies)
        print(success, msg, cookies_str)

if __name__ == '__main__':
    login_util = XHSLoginApi()
    asyncio.run(login_util.qrcodeMain())
    # asyncio.run(login_util.phoneMain())
apis/xhs_pugongying_apis.py
ADDED
@@ -0,0 +1,176 @@
import json
from xhs_utils.cookie_util import trans_cookies
from xhs_utils.xhs_pugongying_util import generate_pugongying_headers, get_pugongying_bozhu_data, generate_pugongying_data
from xhs_utils.xhs_util import get_request_headers_template
from xhs_utils.http_client import HttpClient


class PuGongYingAPI:
    def __init__(self):
        self.base_url = "https://pgy.xiaohongshu.com"
        self.client = HttpClient()

    def get_all_categories(self, cookies):
        api = '/api/solar/cooperator/content/tag_tree'
        headers = generate_pugongying_headers(cookies['a1'], api)
        result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies)
        if not result.ok:
            raise Exception(result.msg)
        distribution_category = result.json["data"]
        return distribution_category

    def choose_categories(self, cookies):
        distribution_category = self.get_all_categories(cookies)
        # print the numbered two-level category tree, then read the user's choice
        for first_index, first_category_temp in enumerate(distribution_category):
            print(f'{first_index}: {first_category_temp["taxonomy1Tag"]}')
            for second_index, second_category_temp in enumerate(first_category_temp["taxonomy2Tags"]):
                print(f'---- {second_index}: {second_category_temp}')
        choice = input("请选择您的类目:如果输入-1则为全部类目,输入1-2-4代表整个美妆/个护,服饰鞋包,母婴用品类目,输入1(1,3,4)-2代表美妆/个护类目下的1,3,4子类目和服饰鞋的全部\n")
        contentTag = generate_pugongying_data(choice, distribution_category)
        return contentTag, distribution_category
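The selection string read above is a tiny grammar: -1 means all categories, 1-2-4 picks whole top-level categories by index, and 1(1,3,4)-2 picks sub-categories 1, 3 and 4 of top-level category 1 plus all of category 2; generate_pugongying_data does the parsing. A hedged usage sketch; the cookies string is a placeholder for a pgy.xiaohongshu.com login:

from xhs_utils.cookie_util import trans_cookies

api = PuGongYingAPI()
cookies = trans_cookies(pgy_cookies_str)  # pgy_cookies_str is a placeholder
# prints the numbered category tree, then reads a choice such as "-1" or "1(1,3)-2"
contentTag, categories = api.choose_categories(cookies)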
| 32 |
+
def get_track(self, data, cookies):
|
| 33 |
+
api = "/api/solar/cooperator/blogger/track"
|
| 34 |
+
data = json.dumps(data, separators=(',', ':'))
|
| 35 |
+
headers = generate_pugongying_headers(cookies['a1'], api, data)
|
| 36 |
+
result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data)
|
| 37 |
+
if not result.ok:
|
| 38 |
+
raise Exception(result.msg)
|
| 39 |
+
return result.json
|
| 40 |
+
|
| 41 |
+
def get_user_by_page(self, page, cookies, contentTag=None):
|
| 42 |
+
api = "/api/solar/cooperator/blogger/v2"
|
| 43 |
+
self_info = self.get_self_info(cookies)
|
| 44 |
+
brandUserId = self_info["data"]["userId"]
|
| 45 |
+
# brandUserId = cookies['x-user-id-ark.xiaohongshu.com']
|
| 46 |
+
data = get_pugongying_bozhu_data(page, brandUserId, contentTag)
|
| 47 |
+
trackId = self.get_track(data, cookies)["data"]["trackId"]
|
| 48 |
+
data['trackId'] = trackId
|
| 49 |
+
data = json.dumps(data, separators=(',', ':'))
|
| 50 |
+
headers = generate_pugongying_headers(cookies['a1'], api, data)
|
| 51 |
+
result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data)
|
| 52 |
+
if not result.ok:
|
| 53 |
+
raise Exception(result.msg)
|
| 54 |
+
res_json = result.json
|
| 55 |
+
total = res_json["data"]["total"]
|
| 56 |
+
user_list = res_json["data"]["kols"]
|
| 57 |
+
+        return user_list, total
+
+    def get_some_user(self, num, cookies, contentTag=None):
+        user_list = []
+        page = 1
+        while len(user_list) < num:
+            user_list_temp, total = self.get_user_by_page(page, cookies, contentTag)
+            user_list.extend(user_list_temp)
+            page += 1
+            # each page returns 20 users; stop once every page has been fetched
+            if page > total / 20 + 1:
+                break
+        if len(user_list) > num:
+            user_list = user_list[:num]
+        return user_list
+
+    def get_user_detail(self, user_id, cookies):
+        api = "/api/solar/kol/dataV3/dataSummary"
+        params = {
+            "userId": user_id,
+            "business": "0"
+        }
+        headers = generate_pugongying_headers(cookies['a1'], api)
+        result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_fans_detail(self, user_id, cookies):
+        api = "/api/solar/kol/dataV3/fansSummary"
+        params = {
+            "userId": user_id
+        }
+        headers = generate_pugongying_headers(cookies['a1'], api)
+        result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_fans_history(self, user_id, cookies):
+        api = f"/api/solar/kol/data/{user_id}/fans_overall_new_history"
+        params = {
+            "dateType": "1",
+            "increaseType": "1"
+        }
+        headers = generate_pugongying_headers(cookies['a1'], api)
+        result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_notes_detail(self, user_id, cookies):
+        api = "/api/solar/kol/dataV3/notesRate"
+        params = {
+            "userId": user_id,
+            "business": "0",
+            "noteType": "3",
+            "dateType": "1",
+            "advertiseSwitch": "1"
+        }
+        headers = generate_pugongying_headers(cookies['a1'], api)
+        result = self.client.request_json("GET", self.base_url + api, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_self_info(self, cookies):
+        url = "https://pgy.xiaohongshu.com/api/solar/user/info"
+        headers = get_request_headers_template()
+        result = self.client.request_json("GET", url, headers=headers, cookies=cookies)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def send_invite(self, user_id, cookies, productName, time, inviteContent, contactInfo):
+        # time is a [start, end] pair of expected publish dates
+        api = "/api/solar/invite/initiate_invite"
+        self_info = self.get_self_info(cookies)
+        cooperateBrandId = self_info["data"]["userId"]
+        cooperateBrandName = self_info["data"]["nickName"]
+        data = {
+            "kolId": user_id,
+            "cooperateBrandId": cooperateBrandId,
+            "cooperateBrandName": cooperateBrandName,
+            "inviteType": 1,
+            "productName": productName,
+            "expectedPublishTimeStart": time[0],
+            "expectedPublishTimeEnd": time[1],
+            "inviteContent": inviteContent,
+            "contactInfo": contactInfo,
+            "contactType": 1,
+            "brandUserId": cooperateBrandId
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        headers = generate_pugongying_headers(cookies['a1'], api)
+        result = self.client.request_json("POST", self.base_url + api, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+if __name__ == '__main__':
+    pugongying_api = PuGongYingAPI()
+    # cookie string for "https://pgy.xiaohongshu.com"
+    cookies_str = ''
+    cookies = trans_cookies(cookies_str)
+    contentTag, distribution_category = pugongying_api.choose_categories(cookies)
+    user_list = pugongying_api.get_some_user(1, cookies, contentTag)
+    for user in user_list:
+        user_id = user["userId"]
+        user_detail = pugongying_api.get_user_detail(user_id, cookies)
+        fans_detail = pugongying_api.get_user_fans_detail(user_id, cookies)
+        fans_history = pugongying_api.get_user_fans_history(user_id, cookies)
+        notes_detail = pugongying_api.get_user_notes_detail(user_id, cookies)
+        # arguments: product name, [expected publish start, end], cooperation pitch, contact info
+        invite_res = pugongying_api.send_invite(user_id, cookies, "测试", ["2021-10-01", "2021-10-01"], "测试", "")
+        print(user_detail)
+        print(fans_detail)
+        print(fans_history)
+        print(notes_detail)
+        print(invite_res)
+        print(f'url: https://www.xiaohongshu.com/user/profile/{user_id}')
+        print('===========================')
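For orientation, a minimal standalone sketch of the invite flow above (a hedged example, not part of the commit; the cookie string, user id, and all invite values are placeholders you must supply):

    from apis.xhs_pugongying_apis import PuGongYingAPI
    from xhs_utils.cookie_util import trans_cookies

    api = PuGongYingAPI()
    cookies = trans_cookies('<pgy.xiaohongshu.com cookie string>')
    res = api.send_invite(
        '<kol user id>',
        cookies,
        productName='Sample product',
        time=['2024-01-01', '2024-01-07'],  # [expectedPublishTimeStart, expectedPublishTimeEnd]
        inviteContent='Cooperation pitch',
        contactInfo='WeChat: example',
    )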
apis/xhs_qianfan_apis.py
ADDED
@@ -0,0 +1,155 @@
+import json
+from xhs_utils.cookie_util import trans_cookies
+from xhs_utils.xhs_qianfan_util import get_qianfan_headers_template, generate_qianfan_data, get_qianfan_userDetail_headers_template
+from xhs_utils.http_client import HttpClient
+
+class QianFanAPI:
+    def __init__(self):
+        self.client = HttpClient()
+
+    def get_all_categories(self, cookies):
+        headers = get_qianfan_headers_template()
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributors-tags"
+        params = {
+            "types": "content_category,distribution_category,user_design_tag,content_tag"
+        }
+        result = self.client.request_json("GET", url, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        distribution_category = result.json["data"]['distributor_tag_map']["distribution_category"]
+        return distribution_category
+
+    def choose_categories(self, cookies):
+        distribution_category = self.get_all_categories(cookies)
+        for first_index, first_category_temp in enumerate(distribution_category):
+            print(f'{first_index}: {first_category_temp["first_category"]}')
+            for second_index, second_category_temp in enumerate(first_category_temp["second_category"]):
+                print(f'---- {second_index}: {second_category_temp}')
+        choice = input(
+            "Select your categories: -1 selects all categories; 1-2-4 selects all of Beauty/Personal Care, Apparel/Shoes/Bags and Mother & Baby; 1(1,3,4)-2 selects subcategories 1, 3, 4 under Beauty/Personal Care plus all of Apparel/Shoes/Bags\n")
+        return choice, distribution_category
+
+    def get_user_by_page(self, choice, distribution_category, page, cookies):
+        headers = get_qianfan_headers_template()
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributors"
+        data = generate_qianfan_data(choice, distribution_category, page)
+        data = json.dumps(data, separators=(',', ':'))
+        result = self.client.request_json("POST", url, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        res_json = result.json
+        total = res_json["data"]["total"]
+        user_list = res_json["data"]["list"]
+        return user_list, total
+
+    def get_some_user(self, choice, distribution_category, num, cookies):
+        user_list = []
+        page = 1
+        while len(user_list) < num:
+            user_list_temp, total = self.get_user_by_page(choice, distribution_category, page, cookies)
+            user_list.extend(user_list_temp)
+            page += 1
+            if page > total / 20 + 1:
+                break
+        if len(user_list) > num:
+            user_list = user_list[:num]
+        return user_list
+
+    def get_user_detail(self, user_id, cookies):
+        headers = get_qianfan_userDetail_headers_template(user_id)
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributor/detail/overview/v2"
+        data = {
+            "buyer_id": user_id,
+            "date_type": 2
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        result = self.client.request_json("POST", url, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_cooperation(self, user_id, cookies):
+        headers = get_qianfan_userDetail_headers_template(user_id)
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributor/cooperative/category/v2"
+        data = {
+            "buyer_id": user_id,
+            "first_live_category": "",
+            "second_live_category": "",
+            "date_type": 2,
+            "page": 1,
+            "size": 10
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        result = self.client.request_json("POST", url, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_shop(self, user_id, cookies):
+        headers = get_qianfan_userDetail_headers_template(user_id)
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributor/cooperative/shop/v2"
+        data = {
+            "buyer_id": user_id,  # was a hard-coded sample id; query the requested user instead
+            "first_live_category": "",
+            "second_live_category": "",
+            "date_type": 2,
+            "page": 1,
+            "size": 10
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        result = self.client.request_json("POST", url, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_item(self, user_id, cookies):
+        headers = get_qianfan_userDetail_headers_template(user_id)
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distributor/cooperative/item/v2"
+        data = {
+            "buyer_id": user_id,  # was a hard-coded sample id; query the requested user instead
+            "first_live_category": "",
+            "second_live_category": "",
+            "date_type": 2,
+            "page": 1,
+            "size": 10
+        }
+        data = json.dumps(data, separators=(',', ':'))
+        result = self.client.request_json("POST", url, headers=headers, cookies=cookies, data=data)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+    def get_user_fans(self, user_id, cookies):
+        headers = get_qianfan_userDetail_headers_template(user_id)
+        # "distribuitor" spelling kept as-is from the endpoint path
+        url = "https://pgy.xiaohongshu.com/api/draco/distributor-square/distribuitor/detail/fans"
+        params = {
+            "distributor_id": user_id,
+            "date_type": "2"
+        }
+        result = self.client.request_json("GET", url, headers=headers, cookies=cookies, params=params)
+        if not result.ok:
+            raise Exception(result.msg)
+        return result.json
+
+if __name__ == '__main__':
+    qianfan_api = QianFanAPI()
+    # cookie string for https://pgy.xiaohongshu.com
+    cookies_str = ''
+    cookies = trans_cookies(cookies_str)
+    choice, distribution_category = qianfan_api.choose_categories(cookies)
+    user_list = qianfan_api.get_some_user(choice, distribution_category, 10, cookies)
+    for user in user_list:
+        user_id = user["distributor_id"]
+        user_detail = qianfan_api.get_user_detail(user_id, cookies)
+        user_cooperation = qianfan_api.get_user_cooperation(user_id, cookies)
+        user_shop = qianfan_api.get_user_shop(user_id, cookies)
+        user_item = qianfan_api.get_user_item(user_id, cookies)
+        user_fans = qianfan_api.get_user_fans(user_id, cookies)
+        print(user)
+        print(user_detail)
+        print(user_cooperation)
+        print(user_shop)
+        print(user_item)
+        print(user_fans)
+        print(f'url: https://www.xiaohongshu.com/user/profile/{user_id}')
+        print(f'qianfan_url: https://pgy.xiaohongshu.com/microapp/distribution/live-blogger-info/{user_id}?source=square')
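The choice string passed through to generate_qianfan_data is a tiny selection syntax (decoded from the interactive prompt above); a sketch of the accepted forms, assuming a logged-in cookies dict:

    qianfan_api = QianFanAPI()
    # '-1'          -> all categories
    # '1-2-4'       -> all of first-level categories 1, 2 and 4
    # '1(1,3,4)-2'  -> subcategories 1, 3, 4 of category 1, plus all of category 2
    distribution_category = qianfan_api.get_all_categories(cookies)
    users = qianfan_api.get_some_user('1(1,3,4)-2', distribution_category, 10, cookies)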
cli.py
ADDED
@@ -0,0 +1,198 @@
+import argparse
+import asyncio
+import os
+import sys
+
+import qrcode
+
+from xhs_utils.common_util import init
+from xhs_utils.session_manager import SessionManager
+from xhs_utils.state_store import StateStore
+from xhs_utils.spider import Data_Spider
+
+
+def _ensure_excel_name(args, default_name):
+    if args.save_choice in ("all", "excel") and not args.excel_name:
+        args.excel_name = default_name
+
+
+def cmd_note(args):
+    cookies_str, base_path = init()
+    sm = SessionManager(cookies_file=args.cookies_file)
+    if args.cookies:
+        cookies_str = args.cookies
+    if args.save_cookies and cookies_str:
+        sm.save_to_file(cookies_str)
+    if args.write_env:
+        sm.save_to_env_file(cookies_str, env_file=args.env_file)
+
+    state = StateStore(args.state_file) if args.resume else None
+    spider = Data_Spider()
+    _ensure_excel_name(args, "note")
+    proxies = None
+    if args.proxy:
+        proxies = {"http": args.proxy, "https": args.proxy}
+    summary = spider.spider_some_note([args.url], cookies_str, base_path, args.save_choice, args.excel_name, proxies=proxies, state_store=state)
+    if isinstance(summary, dict):
+        print(summary)
+
+
+def cmd_user(args):
+    cookies_str, base_path = init()
+    sm = SessionManager(cookies_file=args.cookies_file)
+    if args.cookies:
+        cookies_str = args.cookies
+    if args.save_cookies and cookies_str:
+        sm.save_to_file(cookies_str)
+    if args.write_env:
+        sm.save_to_env_file(cookies_str, env_file=args.env_file)
+
+    state = StateStore(args.state_file) if args.resume else None
+    spider = Data_Spider()
+    _ensure_excel_name(args, args.url.split("/")[-1].split("?")[0])
+    proxies = None
+    if args.proxy:
+        proxies = {"http": args.proxy, "https": args.proxy}
+    summary = spider.spider_user_all_note(args.url, cookies_str, base_path, args.save_choice, args.excel_name, proxies=proxies, state_store=state)
+    print(summary)
+
+
+def cmd_search(args):
+    cookies_str, base_path = init()
+    sm = SessionManager(cookies_file=args.cookies_file)
+    if args.cookies:
+        cookies_str = args.cookies
+    if args.save_cookies and cookies_str:
+        sm.save_to_file(cookies_str)
+    if args.write_env:
+        sm.save_to_env_file(cookies_str, env_file=args.env_file)
+
+    state = StateStore(args.state_file) if args.resume else None
+    spider = Data_Spider()
+    _ensure_excel_name(args, args.query)
+    proxies = None
+    if args.proxy:
+        proxies = {"http": args.proxy, "https": args.proxy}
+    summary = spider.spider_some_search_note(
+        args.query,
+        args.num,
+        cookies_str,
+        base_path,
+        args.save_choice,
+        args.sort,
+        args.note_type,
+        args.note_time,
+        args.note_range,
+        args.pos_distance,
+        geo=None,
+        excel_name=args.excel_name,
+        proxies=proxies,
+        state_store=state,
+    )
+    print(summary)
+
+
+async def _pc_login_qrcode(save_path: str | None, headless: bool, poll_interval_s: float):
+    try:
+        from apis.xhs_pc_login_apis import XHSLoginApi
+    except Exception as e:
+        raise RuntimeError(f"missing_login_dependency: {e}")
+
+    login_api = XHSLoginApi()
+    cookies = await login_api.xhsGenerateInitCookies(headless=headless)
+    success, msg, qrcode_dict = await login_api.xhsGenerateQRcode(cookies)
+    if not success:
+        raise RuntimeError(msg)
+
+    verify_url = qrcode_dict["verify_url"]
+    if not verify_url:
+        raise RuntimeError("verify_url_empty")
+
+    if save_path:
+        os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
+        img = qrcode.make(verify_url)
+        img.save(save_path)
+    else:
+        print(verify_url)
+
+    while True:
+        success, msg, res = await login_api.xhsCheckQRCodeLogin(qrcode_dict["qr_id"], qrcode_dict["code"], qrcode_dict["cookies"])
+        if success and res.get("cookies_str"):
+            return res["cookies_str"]
+        await asyncio.sleep(poll_interval_s)
+
+
+def cmd_login_pc_qrcode(args):
+    cookies_str = asyncio.run(_pc_login_qrcode(args.qr_path, args.headless, args.poll_interval))
+    sm = SessionManager(cookies_file=args.cookies_file)
+    if args.save_cookies:
+        sm.save_to_file(cookies_str)
+    if args.write_env:
+        sm.save_to_env_file(cookies_str, env_file=args.env_file)
+    print(cookies_str)
+
+
+def build_parser():
+    parser = argparse.ArgumentParser(prog="xhs", add_help=True)
+    parser.add_argument("--cookies", default=None)
+    parser.add_argument("--cookies-file", default=None)
+    parser.add_argument("--save-cookies", action="store_true")
+    parser.add_argument("--write-env", action="store_true")
+    parser.add_argument("--env-file", default=os.path.join(".env"))
+    parser.add_argument("--proxy", default=None)
+    parser.add_argument("--state-file", default=os.path.join("datas", "state.json"))
+
+    sub = parser.add_subparsers(dest="cmd", required=True)
+
+    p_note = sub.add_parser("note")
+    p_note.add_argument("--url", required=True)
+    p_note.add_argument("--save-choice", default="all", choices=["all", "excel", "media", "media-video", "media-image"])
+    p_note.add_argument("--excel-name", default="")
+    p_note.add_argument("--resume", action="store_true")
+    p_note.set_defaults(func=cmd_note)
+
+    p_user = sub.add_parser("user")
+    p_user.add_argument("--url", required=True)
+    p_user.add_argument("--save-choice", default="all", choices=["all", "excel", "media", "media-video", "media-image"])
+    p_user.add_argument("--excel-name", default="")
+    p_user.add_argument("--resume", action="store_true")
+    p_user.set_defaults(func=cmd_user)
+
+    p_search = sub.add_parser("search")
+    p_search.add_argument("--query", required=True)
+    p_search.add_argument("--num", type=int, default=10)
+    p_search.add_argument("--sort", type=int, default=0)
+    p_search.add_argument("--note-type", type=int, default=0)
+    p_search.add_argument("--note-time", type=int, default=0)
+    p_search.add_argument("--note-range", type=int, default=0)
+    p_search.add_argument("--pos-distance", type=int, default=0)
+    p_search.add_argument("--save-choice", default="all", choices=["all", "excel", "media", "media-video", "media-image"])
+    p_search.add_argument("--excel-name", default="")
+    p_search.add_argument("--resume", action="store_true")
+    p_search.set_defaults(func=cmd_search)
+
+    p_login = sub.add_parser("login")
+    login_sub = p_login.add_subparsers(dest="login_cmd", required=True)
+    p_login_pc = login_sub.add_parser("pc-qrcode")
+    p_login_pc.add_argument("--qr-path", default=os.path.join("datas", "qrcode.png"))
+    p_login_pc.add_argument("--headless", action="store_true")
+    p_login_pc.add_argument("--poll-interval", type=float, default=1.0)
+    p_login_pc.set_defaults(func=cmd_login_pc_qrcode)
+
+    return parser
+
+
+def main(argv=None):
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    try:
+        args.func(args)
+    except KeyboardInterrupt:
+        raise
+    except Exception as e:
+        print(str(e), file=sys.stderr)
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
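Typical invocations of the CLI above (note that the session and proxy flags are defined on the top-level parser, so they must precede the subcommand; the proxy address and note id are placeholders):

    # log in via QR code, persisting the cookies to file and .env
    python cli.py --save-cookies --write-env login pc-qrcode

    # crawl a single note through a proxy
    python cli.py --proxy http://127.0.0.1:7890 note --url 'https://www.xiaohongshu.com/explore/<note_id>'

    # resumable keyword search, Excel output only
    python cli.py search --query coffee --num 50 --save-choice excel --resume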
docker-compose.yml
ADDED
@@ -0,0 +1,14 @@
+version: "3.8"
+
+services:
+  spider_xhs:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./storage:/app/storage
+    environment:
+      ENGINE_STRATEGY: auto
+      STORAGE_ROOT: /app/storage
+      CALLBACK_URL: http://example.com/callback
+    restart: always
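To build and run the service with this compose file (CALLBACK_URL above is a placeholder to replace; the host ./storage directory is created on first run):

    docker compose up -d --build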
engines/__init__.py
ADDED
@@ -0,0 +1,19 @@
+import sys
+from pathlib import Path
+
+_pkg_root = Path(__file__).resolve().parents[1]
+if str(_pkg_root) not in sys.path:
+    sys.path.insert(0, str(_pkg_root))
+
+from .base import Engine, EngineRunOutput
+from .mediacrawler import MediaCrawlerEngine
+from .spider_xhs import SpiderXHSEngine
+from .agentic_crawler import AgenticCrawlerEngine
+
+__all__ = [
+    "Engine",
+    "EngineRunOutput",
+    "MediaCrawlerEngine",
+    "SpiderXHSEngine",
+    "AgenticCrawlerEngine",
+]
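The sys.path shim above appears to exist so that sibling top-level packages used by the engines (e.g. service, orchestrator) resolve even when engines is imported from the repository root; consumers can then pull every engine from the package directly:

    from engines import MediaCrawlerEngine, SpiderXHSEngine, AgenticCrawlerEngine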
engines/agentic_crawler.py
ADDED
@@ -0,0 +1,78 @@
+from typing import Any, Dict
+import asyncio
+from datetime import datetime, timezone
+import hashlib
+import sys
+import os
+
+# make sibling top-level packages (service, orchestrator) importable
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from .base import EngineRunOutput
+from service.tasks import TaskRecord
+from orchestrator.agent_utils import get_llm, get_browser_with_auth
+from browser_use import Agent
+
+class AgenticCrawlerEngine:
+    name = "agentic_crawler"
+
+    def __init__(self, proxy: str | None = None, storage_state_paths: list[str] | None = None):
+        self.proxy = proxy
+        self.storage_state_paths = storage_state_paths or []
+
+    def _get_storage_state_path(self) -> str | None:
+        if self.storage_state_paths:
+            return self.storage_state_paths[0]
+        return None
+
+    def run(self, task: TaskRecord) -> EngineRunOutput:
+        return asyncio.run(self._run_async(task))
+
+    async def _run_async(self, task: TaskRecord) -> EngineRunOutput:
+        task_type = task.task_type
+        payload = task.payload or {}
+
+        llm = get_llm()
+        browser = get_browser_with_auth(self._get_storage_state_path())
+
+        prompt = ""
+        if task_type == "search_notes":
+            keyword = payload.get("keyword")
+            prompt = f"Go to Xiaohongshu (https://www.xiaohongshu.com/). Search for '{keyword}', click on the first 3 notes, and extract their title, author, likes, and content. Return the results in a valid JSON array format."
+        elif task_type == "note_detail":
+            url = payload.get("note_url") or payload.get("url")
+            prompt = f"Open this Xiaohongshu note: {url}. Extract its title, author, likes, and content. Return the result in a valid JSON format."
+        else:
+            await browser.close()
+            raise ValueError(f"Agentic Crawler does not support task type: {task_type}")
+
+        agent = Agent(task=prompt, llm=llm, browser=browser)
+
+        try:
+            result = await agent.run()
+            final_text = result.final_result() if hasattr(result, 'final_result') else str(result)
+
+            source_ref = payload.get("keyword") or payload.get("note_url") or payload.get("url") or ""
+            dedup_payload = f"{task_type}:{source_ref}".encode("utf-8", errors="ignore")
+
+            meta = {
+                "task_id": task.id,
+                "source_engine": "browser",
+                "engine_name": self.name,
+                "source_type": task_type,
+                "source_ref": source_ref,
+                "operator": payload.get("operator") or "system",
+                "ingested_at": datetime.now(timezone.utc).isoformat(),
+                "dedup_key": hashlib.sha1(dedup_payload).hexdigest(),
+                "ok": True,
+            }
+
+            return EngineRunOutput(
+                raw={"agent_result": final_text},
+                normalized={"data": final_text},
+                meta=meta
+            )
+
+        except Exception as e:
+            # chain the original exception so the traceback is preserved
+            raise RuntimeError(f"Agentic Crawler failed: {e}") from e
+        finally:
+            await browser.close()
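A hedged usage sketch for the engine above (the TaskRecord constructor fields are assumed from how run() reads them, not confirmed against service.tasks; browser_use, an LLM key, and a Playwright storage state file must already be configured):

    from engines.agentic_crawler import AgenticCrawlerEngine
    from service.tasks import TaskRecord

    engine = AgenticCrawlerEngine(storage_state_paths=['datas/storage_state.json'])
    task = TaskRecord(id='t-1', task_type='search_notes', payload={'keyword': 'coffee'})  # hypothetical construction
    out = engine.run(task)
    print(out.meta['dedup_key'], out.normalized['data'])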
engines/base.py
ADDED
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, Protocol
+
+from ..service.tasks import TaskRecord
+
+
+@dataclass(frozen=True)
+class EngineRunOutput:
+    raw: Any
+    normalized: Any
+    meta: Dict[str, Any]
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"raw": self.raw, "normalized": self.normalized, "meta": self.meta}
+
+
+class Engine(Protocol):
+    name: str
+
+    def run(self, task: TaskRecord) -> EngineRunOutput: ...
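Because Engine is a typing.Protocol, engines are matched structurally rather than by inheritance; a minimal sketch of a conforming engine (EchoEngine is illustrative, not part of the commit):

    from engines.base import Engine, EngineRunOutput

    class EchoEngine:
        name = 'echo'

        def run(self, task):  # task: TaskRecord
            meta = {'task_id': task.id, 'engine_name': self.name, 'ok': True}
            return EngineRunOutput(raw=task.payload, normalized=task.payload, meta=meta)

    engine: Engine = EchoEngine()  # type-checks without inheriting from Engine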
engines/mediacrawler.py
ADDED
@@ -0,0 +1,1081 @@
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import html
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import re
|
| 9 |
+
import time
|
| 10 |
+
import urllib.parse
|
| 11 |
+
from datetime import datetime, timezone
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Dict, Iterable, List, Tuple
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
| 17 |
+
from playwright.sync_api import sync_playwright
|
| 18 |
+
|
| 19 |
+
_PLAYWRIGHT_IMPORT_ERROR: str | None = None
|
| 20 |
+
except Exception as e:
|
| 21 |
+
PlaywrightTimeoutError = Exception
|
| 22 |
+
sync_playwright = None
|
| 23 |
+
_PLAYWRIGHT_IMPORT_ERROR = str(e)
|
| 24 |
+
|
| 25 |
+
from ..service.tasks import TaskRecord
|
| 26 |
+
from .base import EngineRunOutput
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
import mediacrawler as _mediacrawler
|
| 30 |
+
|
| 31 |
+
_MEDIACRAWLER_IMPORT_ERROR: str | None = None
|
| 32 |
+
except Exception as e:
|
| 33 |
+
_mediacrawler = None
|
| 34 |
+
_MEDIACRAWLER_IMPORT_ERROR = str(e)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
_NOTE_ID_RE = re.compile(r"/explore/([^/?#]+)")
|
| 38 |
+
_USER_ID_RE = re.compile(r"/user/profile/([^/?#]+)")
|
| 39 |
+
_HREF_RE = re.compile(r'href=[\'"]([^\'"]+)[\'"]', re.IGNORECASE)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _iso_now() -> str:
|
| 43 |
+
return datetime.now(timezone.utc).isoformat()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _dedup_key(source_type: str, source_ref: str) -> str:
|
| 47 |
+
payload = f"{source_type}:{source_ref}".encode("utf-8", errors="ignore")
|
| 48 |
+
return hashlib.sha1(payload).hexdigest()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _parse_user_id(payload: Dict[str, Any]) -> str | None:
|
| 52 |
+
user_id = payload.get("user_id") or payload.get("uid")
|
| 53 |
+
if user_id:
|
| 54 |
+
return str(user_id)
|
| 55 |
+
user_url = payload.get("user_url") or payload.get("url")
|
| 56 |
+
if not user_url:
|
| 57 |
+
return None
|
| 58 |
+
try:
|
| 59 |
+
parsed = urllib.parse.urlparse(str(user_url))
|
| 60 |
+
return parsed.path.split("/")[-1] or None
|
| 61 |
+
except Exception:
|
| 62 |
+
return None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _validate_storage_state(path: Path) -> Tuple[bool, str | None]:
|
| 66 |
+
if not path.exists():
|
| 67 |
+
return False, f"storage_state_not_found={path}"
|
| 68 |
+
if not path.is_file():
|
| 69 |
+
return False, f"storage_state_not_file={path}"
|
| 70 |
+
if not os.access(path, os.R_OK):
|
| 71 |
+
return False, f"storage_state_not_readable={path}"
|
| 72 |
+
try:
|
| 73 |
+
with path.open("r", encoding="utf-8") as f:
|
| 74 |
+
json.load(f)
|
| 75 |
+
except Exception as e:
|
| 76 |
+
return False, f"storage_state_invalid_json={path} err={e}"
|
| 77 |
+
return True, None
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _extract_first_existing(paths: Iterable[Path]) -> Path | None:
|
| 81 |
+
for path in paths:
|
| 82 |
+
ok, _ = _validate_storage_state(path)
|
| 83 |
+
if ok:
|
| 84 |
+
return path
|
| 85 |
+
return None
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _classify_browser_error(msg: str) -> str:
|
| 89 |
+
m = (msg or "").lower()
|
| 90 |
+
if "captcha_detected" in m:
|
| 91 |
+
return "captcha"
|
| 92 |
+
if "err_proxy" in m or "proxy" in m or "tunnel" in m:
|
| 93 |
+
return "proxy_failed"
|
| 94 |
+
if "timeout" in m:
|
| 95 |
+
return "timeout"
|
| 96 |
+
if "net::" in m or "dns" in m or "connection" in m:
|
| 97 |
+
return "timeout"
|
| 98 |
+
if "storage_state" in m or "login" in m or "auth" in m:
|
| 99 |
+
return "auth"
|
| 100 |
+
if "captcha" in m or "验证码" in msg or "验证" in msg:
|
| 101 |
+
return "captcha"
|
| 102 |
+
return "parse"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _proxy_settings(proxy: str | None) -> dict | None:
|
| 106 |
+
if not proxy:
|
| 107 |
+
return None
|
| 108 |
+
proxy_value = str(proxy).strip()
|
| 109 |
+
if proxy_value == "":
|
| 110 |
+
return None
|
| 111 |
+
if "://" not in proxy_value:
|
| 112 |
+
proxy_value = f"http://{proxy_value}"
|
| 113 |
+
try:
|
| 114 |
+
parsed = urllib.parse.urlparse(proxy_value)
|
| 115 |
+
if parsed.scheme and parsed.hostname:
|
| 116 |
+
server = f"{parsed.scheme}://{parsed.hostname}"
|
| 117 |
+
if parsed.port:
|
| 118 |
+
server = f"{server}:{parsed.port}"
|
| 119 |
+
cfg: dict[str, Any] = {"server": server}
|
| 120 |
+
if parsed.username:
|
| 121 |
+
cfg["username"] = urllib.parse.unquote(parsed.username)
|
| 122 |
+
if parsed.password:
|
| 123 |
+
cfg["password"] = urllib.parse.unquote(parsed.password)
|
| 124 |
+
return cfg
|
| 125 |
+
except Exception:
|
| 126 |
+
return {"server": proxy_value}
|
| 127 |
+
return {"server": proxy_value}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _extract_note_id(url: str | None) -> str | None:
|
| 131 |
+
if not url:
|
| 132 |
+
return None
|
| 133 |
+
m = _NOTE_ID_RE.search(str(url))
|
| 134 |
+
return m.group(1) if m else None
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _extract_user_id(url: str | None) -> str | None:
|
| 138 |
+
if not url:
|
| 139 |
+
return None
|
| 140 |
+
m = _USER_ID_RE.search(str(url))
|
| 141 |
+
return m.group(1) if m else None
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _coerce_timestamp_to_iso(value: Any) -> str | None:
|
| 145 |
+
if value is None:
|
| 146 |
+
return None
|
| 147 |
+
if isinstance(value, (int, float)):
|
| 148 |
+
ts = float(value)
|
| 149 |
+
if ts > 1e12:
|
| 150 |
+
ts = ts / 1000.0
|
| 151 |
+
if ts <= 0:
|
| 152 |
+
return None
|
| 153 |
+
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
|
| 154 |
+
raw = str(value).strip()
|
| 155 |
+
if raw == "":
|
| 156 |
+
return None
|
| 157 |
+
if raw.isdigit():
|
| 158 |
+
try:
|
| 159 |
+
return _coerce_timestamp_to_iso(int(raw))
|
| 160 |
+
except Exception:
|
| 161 |
+
return None
|
| 162 |
+
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y/%m/%d %H:%M:%S", "%Y/%m/%d"):
|
| 163 |
+
try:
|
| 164 |
+
dt = datetime.strptime(raw, fmt)
|
| 165 |
+
return dt.replace(tzinfo=timezone.utc).isoformat()
|
| 166 |
+
except Exception:
|
| 167 |
+
continue
|
| 168 |
+
return None
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _meta_content(html_text: str, *, key: str, attr: str = "property") -> str | None:
|
| 172 |
+
if not html_text:
|
| 173 |
+
return None
|
| 174 |
+
pattern = re.compile(
|
| 175 |
+
rf"<meta[^>]+{attr}\s*=\s*['\"]{re.escape(key)}['\"][^>]*>",
|
| 176 |
+
re.IGNORECASE,
|
| 177 |
+
)
|
| 178 |
+
m = pattern.search(html_text)
|
| 179 |
+
if not m:
|
| 180 |
+
return None
|
| 181 |
+
tag = m.group(0)
|
| 182 |
+
m2 = re.search(r"content\s*=\s*['\"]([^'\"]+)['\"]", tag, re.IGNORECASE)
|
| 183 |
+
if not m2:
|
| 184 |
+
return None
|
| 185 |
+
val = html.unescape(m2.group(1)).strip()
|
| 186 |
+
return val or None
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _tag_text(html_text: str, tag: str) -> str | None:
|
| 190 |
+
if not html_text:
|
| 191 |
+
return None
|
| 192 |
+
m = re.search(rf"<{tag}[^>]*>(.*?)</{tag}>", html_text, re.IGNORECASE | re.DOTALL)
|
| 193 |
+
if not m:
|
| 194 |
+
return None
|
| 195 |
+
text = re.sub(r"<[^>]+>", " ", m.group(1))
|
| 196 |
+
text = html.unescape(text)
|
| 197 |
+
text = re.sub(r"\s+", " ", text).strip()
|
| 198 |
+
return text or None
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def _search_json_value(html_text: str, key: str) -> str | None:
|
| 202 |
+
if not html_text:
|
| 203 |
+
return None
|
| 204 |
+
m = re.search(rf'"{re.escape(key)}"\s*:\s*"([^"]+)"', html_text)
|
| 205 |
+
if not m:
|
| 206 |
+
return None
|
| 207 |
+
val = html.unescape(m.group(1)).strip()
|
| 208 |
+
return val or None
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def _search_json_number(html_text: str, key: str) -> int | None:
|
| 212 |
+
if not html_text:
|
| 213 |
+
return None
|
| 214 |
+
m = re.search(rf'"{re.escape(key)}"\s*:\s*(\d{{8,13}})', html_text)
|
| 215 |
+
if not m:
|
| 216 |
+
return None
|
| 217 |
+
try:
|
| 218 |
+
return int(m.group(1))
|
| 219 |
+
except Exception:
|
| 220 |
+
return None
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def _extract_note_fields_from_html(html_text: str) -> Dict[str, Any]:
|
| 224 |
+
title = (
|
| 225 |
+
_meta_content(html_text, key="og:title")
|
| 226 |
+
or _meta_content(html_text, key="twitter:title", attr="name")
|
| 227 |
+
or _search_json_value(html_text, "title")
|
| 228 |
+
or _tag_text(html_text, "title")
|
| 229 |
+
)
|
| 230 |
+
author = (
|
| 231 |
+
_search_json_value(html_text, "nickname")
|
| 232 |
+
or _search_json_value(html_text, "name")
|
| 233 |
+
or _meta_content(html_text, key="og:site_name")
|
| 234 |
+
)
|
| 235 |
+
author_id = _search_json_value(html_text, "user_id") or _search_json_value(html_text, "userId")
|
| 236 |
+
ts = (
|
| 237 |
+
_search_json_number(html_text, "time")
|
| 238 |
+
or _search_json_number(html_text, "publishTime")
|
| 239 |
+
or _search_json_number(html_text, "publish_time")
|
| 240 |
+
)
|
| 241 |
+
publish_time = _coerce_timestamp_to_iso(ts) if ts is not None else None
|
| 242 |
+
return {
|
| 243 |
+
"title": title,
|
| 244 |
+
"author": author,
|
| 245 |
+
"author_id": author_id,
|
| 246 |
+
"publish_time": publish_time,
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def _extract_profile_fields_from_html(html_text: str) -> Dict[str, Any]:
|
| 251 |
+
title = (
|
| 252 |
+
_meta_content(html_text, key="og:title")
|
| 253 |
+
or _meta_content(html_text, key="twitter:title", attr="name")
|
| 254 |
+
or _tag_text(html_text, "title")
|
| 255 |
+
)
|
| 256 |
+
nickname = _search_json_value(html_text, "nickname") or _search_json_value(html_text, "name")
|
| 257 |
+
user_id = _search_json_value(html_text, "user_id") or _search_json_value(html_text, "userId")
|
| 258 |
+
return {"title": title, "nickname": nickname, "user_id": user_id}
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class MediaCrawlerEngine:
|
| 264 |
+
name = "mediacrawler"
|
| 265 |
+
|
| 266 |
+
def __init__(
|
| 267 |
+
self,
|
| 268 |
+
*,
|
| 269 |
+
proxy: str | None = None,
|
| 270 |
+
storage_state_paths: Iterable[Path] = (),
|
| 271 |
+
headless: bool = True,
|
| 272 |
+
):
|
| 273 |
+
self.proxy = proxy
|
| 274 |
+
self.storage_state_paths = tuple(storage_state_paths)
|
| 275 |
+
self.headless = bool(headless)
|
| 276 |
+
self.retry_max = self._env_int("MEDIACRAWLER_RETRY_MAX", 3)
|
| 277 |
+
self.retry_base_s = self._env_float("MEDIACRAWLER_RETRY_BASE_S", 0.8)
|
| 278 |
+
self.retry_cap_s = self._env_float("MEDIACRAWLER_RETRY_CAP_S", 12.0)
|
| 279 |
+
self.delay_min_s = self._env_float("MEDIACRAWLER_DELAY_MIN_S", 0.4)
|
| 280 |
+
self.delay_max_s = self._env_float("MEDIACRAWLER_DELAY_MAX_S", 1.2)
|
| 281 |
+
self.proxy_pool_env = os.getenv("SERVICE_PROXY_POOL") or os.getenv("MEDIACRAWLER_PROXY_POOL") or ""
|
| 282 |
+
self.stealth_enabled = self._env_bool("MEDIACRAWLER_STEALTH", True)
|
| 283 |
+
self.humanize_enabled = self._env_bool("MEDIACRAWLER_HUMANIZE", True)
|
| 284 |
+
|
| 285 |
+
def run(self, task: TaskRecord) -> EngineRunOutput:
|
| 286 |
+
source_type = task.task_type
|
| 287 |
+
payload = task.payload or {}
|
| 288 |
+
operator = str(payload.get("operator") or "system")
|
| 289 |
+
|
| 290 |
+
raw: Any = None
|
| 291 |
+
normalized: Any = None
|
| 292 |
+
ok = False
|
| 293 |
+
error_kind: str | None = None
|
| 294 |
+
error_message: str | None = None
|
| 295 |
+
source_ref: str = ""
|
| 296 |
+
|
| 297 |
+
if self._is_mock_mode(payload):
|
| 298 |
+
try:
|
| 299 |
+
if source_type == "note_url":
|
| 300 |
+
source_ref = str(payload.get("note_url") or payload.get("url") or "")
|
| 301 |
+
raw, normalized = self._run_note_url_mock(source_ref, payload)
|
| 302 |
+
elif source_type == "search":
|
| 303 |
+
query = str(payload.get("query") or payload.get("keyword") or "")
|
| 304 |
+
source_ref = query
|
| 305 |
+
raw, normalized = self._run_search_mock(query, payload)
|
| 306 |
+
elif source_type == "user_profile":
|
| 307 |
+
user_url = payload.get("user_url") or payload.get("url")
|
| 308 |
+
user_id = _parse_user_id(payload)
|
| 309 |
+
source_ref = str(user_url or user_id or "")
|
| 310 |
+
raw, normalized = self._run_user_profile_mock(user_url, user_id, payload)
|
| 311 |
+
else:
|
| 312 |
+
error_kind = "parse"
|
| 313 |
+
error_message = f"unsupported_task_type={source_type}"
|
| 314 |
+
meta = self._build_meta(
|
| 315 |
+
task_id=task.id,
|
| 316 |
+
operator=operator,
|
| 317 |
+
source_type=source_type,
|
| 318 |
+
source_ref=source_ref,
|
| 319 |
+
ok=False,
|
| 320 |
+
error_kind=error_kind,
|
| 321 |
+
error_message=error_message,
|
| 322 |
+
storage_state_path=None,
|
| 323 |
+
)
|
| 324 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 325 |
+
ok = True
|
| 326 |
+
except Exception as e:
|
| 327 |
+
error_message = str(e)
|
| 328 |
+
error_kind = _classify_browser_error(error_message)
|
| 329 |
+
meta = self._build_meta(
|
| 330 |
+
task_id=task.id,
|
| 331 |
+
operator=operator,
|
| 332 |
+
source_type=source_type,
|
| 333 |
+
source_ref=source_ref,
|
| 334 |
+
ok=ok,
|
| 335 |
+
error_kind=error_kind,
|
| 336 |
+
error_message=error_message,
|
| 337 |
+
storage_state_path=None,
|
| 338 |
+
)
|
| 339 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 340 |
+
|
| 341 |
+
if sync_playwright is None:
|
| 342 |
+
source_ref = str(
|
| 343 |
+
payload.get("note_url")
|
| 344 |
+
or payload.get("url")
|
| 345 |
+
or payload.get("query")
|
| 346 |
+
or payload.get("keyword")
|
| 347 |
+
or payload.get("user_id")
|
| 348 |
+
or ""
|
| 349 |
+
)
|
| 350 |
+
meta = self._build_meta(
|
| 351 |
+
task_id=task.id,
|
| 352 |
+
operator=operator,
|
| 353 |
+
source_type=source_type,
|
| 354 |
+
source_ref=source_ref,
|
| 355 |
+
ok=False,
|
| 356 |
+
error_kind="missing_dependency",
|
| 357 |
+
error_message=f"playwright 依赖未安装或不可用: {_PLAYWRIGHT_IMPORT_ERROR or 'import_failed'}",
|
| 358 |
+
storage_state_path=None,
|
| 359 |
+
)
|
| 360 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 361 |
+
|
| 362 |
+
configured_paths: List[Path] = list(self.storage_state_paths)
|
| 363 |
+
storage_state_payload = payload.get("storage_state_path") or payload.get("storage_state")
|
| 364 |
+
if storage_state_payload:
|
| 365 |
+
try:
|
| 366 |
+
configured_paths.insert(0, Path(str(storage_state_payload)).expanduser())
|
| 367 |
+
except Exception:
|
| 368 |
+
pass
|
| 369 |
+
storage_state_path = _extract_first_existing(configured_paths)
|
| 370 |
+
if storage_state_path is None:
|
| 371 |
+
first_choice = configured_paths[0] if configured_paths else None
|
| 372 |
+
msg = "missing_storage_state_path"
|
| 373 |
+
if first_choice is not None:
|
| 374 |
+
ok_state, state_err = _validate_storage_state(first_choice)
|
| 375 |
+
if not ok_state:
|
| 376 |
+
msg = state_err or msg
|
| 377 |
+
meta = self._build_meta(
|
| 378 |
+
task_id=task.id,
|
| 379 |
+
operator=operator,
|
| 380 |
+
source_type=source_type,
|
| 381 |
+
source_ref=str(
|
| 382 |
+
payload.get("note_url")
|
| 383 |
+
or payload.get("url")
|
| 384 |
+
or payload.get("query")
|
| 385 |
+
or payload.get("keyword")
|
| 386 |
+
or payload.get("user_id")
|
| 387 |
+
or ""
|
| 388 |
+
),
|
| 389 |
+
ok=False,
|
| 390 |
+
error_kind="auth",
|
| 391 |
+
error_message=msg,
|
| 392 |
+
storage_state_path=str(first_choice) if first_choice is not None else None,
|
| 393 |
+
)
|
| 394 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 395 |
+
|
| 396 |
+
try:
|
| 397 |
+
if source_type == "note_url":
|
| 398 |
+
source_ref = str(payload.get("note_url") or payload.get("url") or "")
|
| 399 |
+
raw, normalized = self._run_note_url(source_ref, storage_state_path)
|
| 400 |
+
elif source_type == "search":
|
| 401 |
+
query = str(payload.get("query") or payload.get("keyword") or "")
|
| 402 |
+
source_ref = query
|
| 403 |
+
raw, normalized = self._run_search(query, payload, storage_state_path)
|
| 404 |
+
elif source_type == "user_profile":
|
| 405 |
+
user_url = payload.get("user_url") or payload.get("url")
|
| 406 |
+
user_id = _parse_user_id(payload)
|
| 407 |
+
source_ref = str(user_url or user_id or "")
|
| 408 |
+
raw, normalized = self._run_user_profile(user_url, user_id, storage_state_path)
|
| 409 |
+
else:
|
| 410 |
+
error_kind = "parse"
|
| 411 |
+
error_message = f"unsupported_task_type={source_type}"
|
| 412 |
+
meta = self._build_meta(
|
| 413 |
+
task_id=task.id,
|
| 414 |
+
operator=operator,
|
| 415 |
+
source_type=source_type,
|
| 416 |
+
source_ref=source_ref,
|
| 417 |
+
ok=False,
|
| 418 |
+
error_kind=error_kind,
|
| 419 |
+
error_message=error_message,
|
| 420 |
+
storage_state_path=str(storage_state_path),
|
| 421 |
+
)
|
| 422 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 423 |
+
|
| 424 |
+
ok = True
|
| 425 |
+
except Exception as e:
|
| 426 |
+
error_message = str(e)
|
| 427 |
+
error_kind = _classify_browser_error(error_message)
|
| 428 |
+
|
| 429 |
+
meta = self._build_meta(
|
| 430 |
+
task_id=task.id,
|
| 431 |
+
operator=operator,
|
| 432 |
+
source_type=source_type,
|
| 433 |
+
source_ref=source_ref,
|
| 434 |
+
ok=ok,
|
| 435 |
+
error_kind=error_kind,
|
| 436 |
+
error_message=error_message,
|
| 437 |
+
storage_state_path=str(storage_state_path),
|
| 438 |
+
)
|
| 439 |
+
return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)
|
| 440 |
+
|
| 441 |
+
def _run_note_url(self, note_url: str, storage_state_path: Path) -> Tuple[Any, Any]:
|
| 442 |
+
if not note_url:
|
| 443 |
+
raise ValueError("missing_note_url")
|
| 444 |
+
url = str(note_url).strip()
|
| 445 |
+
note_id = _extract_note_id(url)
|
| 446 |
+
|
| 447 |
+
def job(page: Any) -> Tuple[Any, Any]:
|
| 448 |
+
self._random_delay()
|
| 449 |
+
self._humanize(page)
|
| 450 |
+
self._goto_with_retry(page, url)
|
| 451 |
+
self._random_delay()
|
| 452 |
+
self._humanize(page)
|
| 453 |
+
html_text = page.content()
|
| 454 |
+
self._raise_if_captcha(html_text)
|
| 455 |
+
extracted = self._extract_note_fields_from_page(page) or {}
|
| 456 |
+
if not extracted.get("title"):
|
| 457 |
+
extracted.update(_extract_note_fields_from_html(html_text))
|
| 458 |
+
title = extracted.get("title") or page.title()
|
| 459 |
+
author = extracted.get("author")
|
| 460 |
+
author_id = extracted.get("author_id")
|
| 461 |
+
publish_time = extracted.get("publish_time")
|
| 462 |
+
raw = {
|
| 463 |
+
"url": page.url,
|
| 464 |
+
"note_id": note_id or _extract_note_id(page.url),
|
| 465 |
+
"title": title,
|
| 466 |
+
"author": author,
|
| 467 |
+
"author_id": author_id,
|
| 468 |
+
"publish_time": publish_time,
|
| 469 |
+
"html": html_text,
|
| 470 |
+
}
|
| 471 |
+
normalized = {
|
| 472 |
+
"note_url": page.url,
|
| 473 |
+
"note_id": note_id or _extract_note_id(page.url),
|
| 474 |
+
"title": title,
|
| 475 |
+
"author": author,
|
| 476 |
+
"author_id": author_id,
|
| 477 |
+
"publish_time": publish_time,
|
| 478 |
+
"crawled_at": _iso_now(),
|
| 479 |
+
}
|
| 480 |
+
return raw, normalized
|
| 481 |
+
|
| 482 |
+
return self._run_with_browser_retry(job, storage_state_path)
|
| 483 |
+
|
| 484 |
+
def _run_search(
|
| 485 |
+
self,
|
| 486 |
+
query: str,
|
| 487 |
+
payload: Dict[str, Any],
|
| 488 |
+
storage_state_path: Path,
|
| 489 |
+
) -> Tuple[Any, Any]:
|
| 490 |
+
if not query:
|
| 491 |
+
raise ValueError("missing_query")
|
| 492 |
+
require_num = int(payload.get("require_num") or payload.get("limit") or 20)
|
| 493 |
+
search_url = (
|
| 494 |
+
"https://www.xiaohongshu.com/search_result?keyword="
|
| 495 |
+
+ urllib.parse.quote(query, safe="")
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
def job(page: Any) -> Tuple[Any, Any]:
|
| 499 |
+
self._random_delay()
|
| 500 |
+
self._humanize(page)
|
| 501 |
+
self._goto_with_retry(page, search_url)
|
| 502 |
+
self._random_delay()
|
| 503 |
+
self._humanize(page)
|
| 504 |
+
for _ in range(3):
|
| 505 |
+
if require_num <= 0:
|
| 506 |
+
break
|
| 507 |
+
hrefs = self._extract_hrefs(page)
|
| 508 |
+
if len(hrefs) >= require_num:
|
| 509 |
+
break
|
| 510 |
+
try:
|
| 511 |
+
page.mouse.wheel(0, 1600)
|
| 512 |
+
except Exception:
|
| 513 |
+
try:
|
| 514 |
+
page.evaluate("window.scrollBy(0, 1600)")
|
| 515 |
+
except Exception:
|
| 516 |
+
break
|
| 517 |
+
self._random_delay()
|
| 518 |
+
self._humanize(page)
|
| 519 |
+
|
| 520 |
+
html_text = page.content()
|
| 521 |
+
self._raise_if_captcha(html_text)
|
| 522 |
+
hrefs = self._extract_hrefs(page)
|
| 523 |
+
if not hrefs:
|
| 524 |
+
hrefs = self._extract_hrefs_from_html(html_text)
|
| 525 |
+
note_urls = self._collect_note_urls(hrefs, require_num)
|
| 526 |
+
raw = {"query": query, "search_url": page.url, "note_urls": note_urls, "html": html_text}
|
| 527 |
+
normalized = [
|
| 528 |
+
{"query": query, "note_url": u, "note_id": _extract_note_id(u), "crawled_at": _iso_now()}
|
| 529 |
+
for u in note_urls
|
| 530 |
+
]
|
| 531 |
+
return raw, normalized
|
| 532 |
+
|
| 533 |
+
return self._run_with_browser_retry(job, storage_state_path)
|
| 534 |
+
|
| 535 |
+
def _run_user_profile(
|
| 536 |
+
self,
|
| 537 |
+
user_url: str | None,
|
| 538 |
+
user_id: str | None,
|
| 539 |
+
storage_state_path: Path,
|
| 540 |
+
) -> Tuple[Any, Any]:
|
| 541 |
+
if user_url:
|
| 542 |
+
url = str(user_url).strip()
|
| 543 |
+
elif user_id:
|
| 544 |
+
url = f"https://www.xiaohongshu.com/user/profile/{user_id}"
|
| 545 |
+
else:
|
| 546 |
+
raise ValueError("missing_user_id")
|
| 547 |
+
default_user_id = user_id or _extract_user_id(url)
|
| 548 |
+
|
| 549 |
+
def job(page: Any) -> Tuple[Any, Any]:
|
| 550 |
+
self._random_delay()
|
| 551 |
+
self._humanize(page)
|
| 552 |
+
self._goto_with_retry(page, url)
|
| 553 |
+
self._random_delay()
|
| 554 |
+
self._humanize(page)
|
| 555 |
+
html_text = page.content()
|
| 556 |
+
self._raise_if_captcha(html_text)
|
| 557 |
+
extracted = self._extract_profile_fields_from_page(page) or {}
|
| 558 |
+
if not extracted.get("nickname") and not extracted.get("title"):
|
| 559 |
+
extracted.update(_extract_profile_fields_from_html(html_text))
|
| 560 |
+
title = extracted.get("title") or page.title()
|
| 561 |
+
nickname = extracted.get("nickname")
|
| 562 |
+
extracted_user_id = extracted.get("user_id") or default_user_id or _extract_user_id(page.url)
|
| 563 |
+
raw = {
|
| 564 |
+
"url": page.url,
|
| 565 |
+
"user_id": extracted_user_id,
|
| 566 |
+
"title": title,
|
| 567 |
+
"nickname": nickname,
|
| 568 |
+
"html": html_text,
|
| 569 |
+
}
|
| 570 |
+
normalized = {
|
| 571 |
+
"user_url": page.url,
|
| 572 |
+
"user_id": extracted_user_id,
|
| 573 |
+
"title": title,
|
| 574 |
+
"nickname": nickname,
|
| 575 |
+
"crawled_at": _iso_now(),
|
| 576 |
+
}
|
| 577 |
+
return raw, normalized
|
| 578 |
+
|
| 579 |
+
return self._run_with_browser_retry(job, storage_state_path)
|
| 580 |
+
|
| 581 |
+
def _goto_with_retry(self, page: Any, url: str) -> None:
|
| 582 |
+
last_err: Exception | None = None
|
| 583 |
+
attempts = max(1, int(self.retry_max or 1))
|
| 584 |
+
for attempt in range(attempts):
|
| 585 |
+
try:
|
| 586 |
+
page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
| 587 |
+
return
|
| 588 |
+
except PlaywrightTimeoutError as e:
|
| 589 |
+
last_err = e
|
| 590 |
+
except Exception as e:
|
| 591 |
+
last_err = e
|
| 592 |
+
if attempt < attempts - 1:
|
| 593 |
+
self._sleep_backoff(attempt)
|
| 594 |
+
if last_err is not None:
|
| 595 |
+
raise last_err
|
| 596 |
+
raise RuntimeError("goto_failed")
|
| 597 |
+
|
| 598 |
+
def _extract_hrefs(self, page: Any) -> List[str]:
|
| 599 |
+
selectors = [
|
| 600 |
+
"a[href*='/explore/']",
|
| 601 |
+
"a[href^='/explore/']",
|
| 602 |
+
"a[href*='xiaohongshu.com/explore/']",
|
| 603 |
+
"a[href]",
|
| 604 |
+
]
|
| 605 |
+
for selector in selectors:
|
| 606 |
+
try:
|
| 607 |
+
hrefs = page.eval_on_selector_all(
|
| 608 |
+
selector,
|
| 609 |
+
"els => els.map(el => el.getAttribute('href')).filter(Boolean)",
|
| 610 |
+
)
|
| 611 |
+
if isinstance(hrefs, list) and hrefs:
|
| 612 |
+
return [str(x) for x in hrefs if x]
|
| 613 |
+
except Exception:
|
| 614 |
+
continue
|
| 615 |
+
return []
|
| 616 |
+
|
| 617 |
+
@staticmethod
|
| 618 |
+
def _env_int(name: str, default: int) -> int:
|
| 619 |
+
value = os.getenv(name)
|
| 620 |
+
if value is None or str(value).strip() == "":
|
| 621 |
+
return int(default)
|
| 622 |
+
try:
|
| 623 |
+
return int(str(value).strip())
|
| 624 |
+
except Exception:
|
| 625 |
+
return int(default)
|
| 626 |
+
|
| 627 |
+
@staticmethod
|
| 628 |
+
def _env_float(name: str, default: float) -> float:
|
| 629 |
+
value = os.getenv(name)
|
| 630 |
+
if value is None or str(value).strip() == "":
|
| 631 |
+
return float(default)
|
| 632 |
+
try:
|
| 633 |
+
return float(str(value).strip())
|
| 634 |
+
except Exception:
|
| 635 |
+
return float(default)
|
| 636 |
+
|
| 637 |
+
@staticmethod
|
| 638 |
+
def _env_bool(name: str, default: bool) -> bool:
|
| 639 |
+
value = os.getenv(name)
|
| 640 |
+
if value is None or str(value).strip() == "":
|
| 641 |
+
return bool(default)
|
| 642 |
+
v = str(value).strip().lower()
|
| 643 |
+
if v in ("1", "true", "yes", "y", "on"):
|
| 644 |
+
return True
|
| 645 |
+
if v in ("0", "false", "no", "n", "off"):
|
| 646 |
+
return False
|
| 647 |
+
return bool(default)
|
| 648 |
+
|
| 649 |
+
@staticmethod
|
| 650 |
+
def _truthy(value: Any) -> bool:
|
| 651 |
+
if value is None:
|
| 652 |
+
return False
|
| 653 |
+
if isinstance(value, bool):
|
| 654 |
+
return value
|
| 655 |
+
v = str(value).strip().lower()
|
| 656 |
+
return v in ("1", "true", "yes", "y", "on")
|
| 657 |
+
|
| 658 |
+
def _is_mock_mode(self, payload: Dict[str, Any]) -> bool:
|
| 659 |
+
if self._truthy(payload.get("mock")):
|
| 660 |
+
return True
|
| 661 |
+
if self._truthy(os.getenv("MEDIACRAWLER_MOCK")):
|
| 662 |
+
return True
|
| 663 |
+
return False
|
| 664 |
+
|
| 665 |
+
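These `_env_*` helpers never raise: unset or blank variables fall back to the default, unparsable values fall back to the default, and booleans accept the usual on/off spellings. A quick sketch of that behavior; the `DEMO_*` variable names and the `MediaCrawlerEngine` class name are assumptions for illustration (the class name is not visible in this hunk):

```python
import os

os.environ["DEMO_RETRY_MAX"] = "5"
os.environ["DEMO_HEADLESS"] = "off"
os.environ["DEMO_DELAY"] = "not-a-number"

# Hypothetical variable names, used only to exercise the parsing rules above.
assert MediaCrawlerEngine._env_int("DEMO_RETRY_MAX", 3) == 5
assert MediaCrawlerEngine._env_bool("DEMO_HEADLESS", True) is False  # "off" parses as False
assert MediaCrawlerEngine._env_float("DEMO_DELAY", 1.5) == 1.5       # unparsable -> default
assert MediaCrawlerEngine._env_bool("DEMO_UNSET", True) is True      # missing -> default
```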
```python
    def _random_delay(self) -> None:
        lo = max(0.0, float(self.delay_min_s or 0.0))
        hi = max(lo, float(self.delay_max_s or lo))
        if hi <= 0:
            return
        time.sleep(random.uniform(lo, hi))

    @staticmethod
    def _stealth_init_script() -> str:
        return """
(() => {
  const define = (obj, prop, val) => {
    try {
      Object.defineProperty(obj, prop, { get: () => val, configurable: true });
    } catch (e) {}
  };

  define(navigator, 'webdriver', undefined);
  define(navigator, 'languages', ['zh-CN', 'zh', 'en-US', 'en']);
  define(navigator, 'plugins', [1, 2, 3, 4, 5]);
  define(navigator, 'hardwareConcurrency', 8);
  define(navigator, 'deviceMemory', 8);

  if (!window.chrome) {
    try {
      window.chrome = { runtime: {} };
    } catch (e) {}
  }

  const originalQuery = (navigator.permissions && navigator.permissions.query) ? navigator.permissions.query.bind(navigator.permissions) : null;
  if (originalQuery) {
    try {
      navigator.permissions.query = (parameters) => {
        if (parameters && parameters.name === 'notifications') {
          const state = (typeof Notification !== 'undefined' && Notification.permission) ? Notification.permission : 'default';
          return Promise.resolve({ state });
        }
        return originalQuery(parameters);
      };
    } catch (e) {}
  }

  const originalGetParameter = (typeof WebGLRenderingContext !== 'undefined' && WebGLRenderingContext.prototype && WebGLRenderingContext.prototype.getParameter)
    ? WebGLRenderingContext.prototype.getParameter
    : null;
  if (originalGetParameter) {
    try {
      WebGLRenderingContext.prototype.getParameter = function(parameter) {
        if (parameter === 37445) return 'Intel Inc.';                 // UNMASKED_VENDOR_WEBGL
        if (parameter === 37446) return 'Intel Iris OpenGL Engine';   // UNMASKED_RENDERER_WEBGL
        return originalGetParameter.apply(this, arguments);
      };
    } catch (e) {}
  }
})();
""".strip()

    def _inject_stealth(self, context: Any, page: Any) -> None:
        if not self.stealth_enabled:
            return
        script = self._stealth_init_script()
        try:
            context.add_init_script(script)
        except Exception:
            pass
        try:
            page.add_init_script(script)
        except Exception:
            pass
```
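`add_init_script` is the key call here: the stealth JavaScript runs in every document before any page script, so `navigator.webdriver` is already masked by the time a site's detection code executes. A standalone sketch of the same pattern with plain Playwright, assuming `pip install playwright` and `playwright install chromium`; it is not wired into the engine above:

```python
from playwright.sync_api import sync_playwright

# A one-line stand-in for the full stealth script above.
STEALTH_JS = "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()
    context.add_init_script(STEALTH_JS)  # runs before any page script, on every navigation
    page = context.new_page()
    page.goto("https://example.com")
    print(page.evaluate("navigator.webdriver"))  # prints None instead of True
    browser.close()
```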
```python
    def _humanize(self, page: Any) -> None:
        if not self.humanize_enabled:
            return
        if random.random() > 0.7:
            return
        try:
            vp = getattr(page, "viewport_size", None) or {}
            w = int(vp.get("width") or 1280)
            h = int(vp.get("height") or 720)
        except Exception:
            w, h = 1280, 720

        action = random.choice(("move", "wheel", "scroll", "pause"))
        if action == "move":
            x = random.randint(0, max(1, w - 1))
            y = random.randint(0, max(1, h - 1))
            steps = random.randint(5, 18)
            try:
                page.mouse.move(x, y, steps=steps)
            except Exception:
                pass
        elif action == "wheel":
            dx = random.randint(-20, 20)
            dy = random.randint(80, 800)
            try:
                page.mouse.wheel(dx, dy)
            except Exception:
                try:
                    page.evaluate(f"window.scrollBy({dx}, {dy})")
                except Exception:
                    pass
        elif action == "scroll":
            dy = random.randint(120, 1200)
            try:
                page.evaluate(f"window.scrollBy(0, {dy})")
            except Exception:
                pass
        # "pause" performs no gesture and falls through to the short sleep below.

        time.sleep(random.uniform(0.05, 0.35))

    @staticmethod
    def _is_captcha_html(html_text: str) -> bool:
        if not html_text:
            return False
        lowered = str(html_text).lower()
        if "captcha" in lowered:
            return True
        if "验证码" in html_text:
            return True
        if "验证" in html_text:
            return True
        return False

    def _raise_if_captcha(self, html_text: str) -> None:
        if self._is_captcha_html(html_text):
            raise RuntimeError("captcha_detected")

    def _sleep_backoff(self, attempt: int) -> None:
        base = max(0.1, float(self.retry_base_s or 0.8))
        cap = max(base, float(self.retry_cap_s or 12.0))
        exp = min(cap, base * (2 ** int(attempt)))
        time.sleep(exp + random.uniform(0.0, exp * 0.25))
```
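`_sleep_backoff` is capped exponential backoff with up to 25% additive jitter. With the defaults above (base 0.8 s, cap 12 s), the schedule can be previewed without actually sleeping:

```python
base, cap = 0.8, 12.0  # the defaults used by _sleep_backoff above
for attempt in range(6):
    exp = min(cap, base * (2 ** attempt))
    print(f"attempt {attempt}: {exp:.1f}s + up to {exp * 0.25:.1f}s jitter")
# 0.8, 1.6, 3.2, 6.4, 12.0 (capped), 12.0 (capped)
```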
```python
    def _proxy_pool(self) -> List[str]:
        values: List[str] = []
        if self.proxy:
            v = str(self.proxy).strip()
            if v:
                values.append(v)
        raw = str(self.proxy_pool_env or "").strip()
        if raw:
            tokens: List[str] = []
            if "," in raw:
                tokens.extend([t.strip() for t in raw.split(",") if t.strip()])
            else:
                tokens.extend([t.strip() for t in raw.split() if t.strip()])
            values.extend(tokens)
        seen: set[str] = set()
        unique: List[str] = []
        for item in values:
            if item in seen:
                continue
            seen.add(item)
            unique.append(item)
        return unique

    def _pick_proxy(self, attempt: int) -> str | None:
        from ..service.proxy_pool import proxy_pool

        dynamic_proxy = proxy_pool.get_random_proxy()
        if dynamic_proxy:
            return dynamic_proxy

        pool = self._proxy_pool()
        if not pool:
            return None
        idx = int(attempt) % len(pool)
        return pool[idx]
```
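Static pool entries come from a single configured proxy plus an environment-supplied list, which may be comma- or whitespace-separated; duplicates are dropped while preserving order, and `_pick_proxy` round-robins over the result only when the dynamic `service.proxy_pool` has nothing to offer. The parsing rules in isolation:

```python
raw = "http://p1:8080,http://p2:8080,http://p1:8080"
# A comma anywhere means comma-splitting; otherwise the value splits on whitespace.
tokens = [t.strip() for t in (raw.split(",") if "," in raw else raw.split()) if t.strip()]
seen, unique = set(), []
for t in tokens:
    if t not in seen:
        seen.add(t)
        unique.append(t)
print(unique)  # ['http://p1:8080', 'http://p2:8080']
```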
```python
    def _run_with_browser_retry(self, job: Any, storage_state_path: Path) -> Tuple[Any, Any]:
        last_err: Exception | None = None
        max_attempts = max(1, int(self.retry_max or 1))
        for attempt in range(max_attempts):
            proxy = self._pick_proxy(attempt)
            proxy_cfg = _proxy_settings(proxy)
            try:
                with sync_playwright() as p:
                    browser = p.chromium.launch(headless=self.headless, proxy=proxy_cfg)
                    try:
                        context = browser.new_context(storage_state=str(storage_state_path))
                        page = context.new_page()
                        self._inject_stealth(context, page)
                        self._humanize(page)
                        res = job(page)
                        if proxy:
                            from ..service.proxy_pool import proxy_pool

                            proxy_pool.report_success(proxy)
                        return res
                    finally:
                        browser.close()
            except Exception as e:
                last_err = e
                reason = _classify_browser_error(str(e))
                if reason in ("captcha", "risk"):
                    raise e
                if proxy:
                    if reason in ("timeout", "proxy_failed", "rate"):
                        try:
                            from ..service.proxy_pool import proxy_pool

                            proxy_pool.report_failure(proxy, reason)
                        except Exception:
                            pass
                if attempt < max_attempts - 1:
                    self._sleep_backoff(attempt)
                    continue
                break
        raise last_err if last_err is not None else RuntimeError("browser_job_failed")

    @staticmethod
    def _extract_hrefs_from_html(html_text: str) -> List[str]:
        if not html_text:
            return []
        hrefs: List[str] = []
        for m in _HREF_RE.finditer(html_text):
            href = html.unescape(m.group(1)).strip()
            if href:
                hrefs.append(href)
        return hrefs

    @staticmethod
    def _collect_note_urls(hrefs: List[str], limit: int) -> List[str]:
        note_urls: List[str] = []
        seen: set[str] = set()
        for href in hrefs or []:
            if "/explore/" not in str(href):
                continue
            u = str(href).strip()
            if u.startswith("//"):
                u = "https:" + u
            if u.startswith("/"):
                u = "https://www.xiaohongshu.com" + u
            if not u.startswith("http"):
                continue
            if u in seen:
                continue
            seen.add(u)
            note_urls.append(u)
            if limit > 0 and len(note_urls) >= int(limit):
                break
        return note_urls
```
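`_collect_note_urls` normalizes protocol-relative and site-relative hrefs to absolute `https://www.xiaohongshu.com` URLs, keeps only `/explore/` links, and de-duplicates up to the limit. For example (illustrative hrefs; the `MediaCrawlerEngine` class name is again an assumption):

```python
hrefs = [
    "/explore/abc123?xsec_token=tok",                        # site-relative
    "//www.xiaohongshu.com/explore/abc123?xsec_token=tok",   # protocol-relative duplicate
    "/user/profile/someone",                                 # not an /explore/ link: skipped
]
print(MediaCrawlerEngine._collect_note_urls(hrefs, limit=10))
# ['https://www.xiaohongshu.com/explore/abc123?xsec_token=tok']
```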
```python
    @staticmethod
    def _safe_locator_text(page: Any, selector: str) -> str | None:
        try:
            loc = page.locator(selector)
            if loc is None:
                return None
            val = loc.first.inner_text(timeout=1000)
            if isinstance(val, str):
                s = val.strip()
                return s or None
        except Exception:
            return None
        return None

    @staticmethod
    def _safe_locator_attr(page: Any, selector: str, name: str) -> str | None:
        try:
            loc = page.locator(selector)
            if loc is None:
                return None
            val = loc.first.get_attribute(name, timeout=1000)
            if isinstance(val, str):
                s = val.strip()
                return s or None
        except Exception:
            return None
        return None

    def _extract_note_fields_from_page(self, page: Any) -> Dict[str, Any]:
        title = (
            self._safe_locator_attr(page, "meta[property='og:title']", "content")
            or self._safe_locator_attr(page, "meta[name='twitter:title']", "content")
            or self._safe_locator_text(page, "h1")
        )
        author = (
            self._safe_locator_text(page, "a[href*='/user/profile/'] span")
            or self._safe_locator_text(page, "a[href*='/user/profile/']")
            or self._safe_locator_text(page, "[class*='author'] [class*='name']")
        )
        publish_text = (
            self._safe_locator_text(page, "time")
            or self._safe_locator_text(page, "[class*='time']")
            or self._safe_locator_text(page, "[class*='date']")
        )
        publish_time = _coerce_timestamp_to_iso(publish_text)
        author_id = None
        try:
            href = self._safe_locator_attr(page, "a[href*='/user/profile/']", "href")
            author_id = _extract_user_id(href)
        except Exception:
            author_id = None
        return {"title": title, "author": author, "author_id": author_id, "publish_time": publish_time}

    def _extract_profile_fields_from_page(self, page: Any) -> Dict[str, Any]:
        title = (
            self._safe_locator_attr(page, "meta[property='og:title']", "content")
            or self._safe_locator_attr(page, "meta[name='twitter:title']", "content")
            or page.title()
        )
        nickname = (
            self._safe_locator_text(page, "h1")
            or self._safe_locator_text(page, "[class*='nickname']")
            or self._safe_locator_text(page, "[class*='name']")
        )
        user_id = _extract_user_id(getattr(page, "url", None))
        return {"title": title, "nickname": nickname, "user_id": user_id}

    def _run_note_url_mock(self, note_url: str, payload: Dict[str, Any]) -> Tuple[Any, Any]:
        url = str(note_url or "").strip()
        if not url:
            raise ValueError("missing_note_url")
        html_text = str(payload.get("mock_html") or payload.get("html") or "").strip()
        if html_text == "":
            html_text = "<html><head><title>mock</title></head><body></body></html>"
        extracted = _extract_note_fields_from_html(html_text)
        note_id = _extract_note_id(url)
        raw = {
            "url": url,
            "note_id": note_id,
            "title": extracted.get("title"),
            "author": extracted.get("author"),
            "author_id": extracted.get("author_id"),
            "publish_time": extracted.get("publish_time"),
            "html": html_text,
        }
        normalized = {
            "note_url": url,
            "note_id": note_id,
            "title": extracted.get("title"),
            "author": extracted.get("author"),
            "author_id": extracted.get("author_id"),
            "publish_time": extracted.get("publish_time"),
            "crawled_at": _iso_now(),
        }
        return raw, normalized

    def _run_search_mock(self, query: str, payload: Dict[str, Any]) -> Tuple[Any, Any]:
        q = str(query or "").strip()
        if q == "":
            raise ValueError("missing_query")
        require_num = int(payload.get("require_num") or payload.get("limit") or 20)
        html_text = str(payload.get("mock_html") or payload.get("html") or "").strip()
        hrefs = self._extract_hrefs_from_html(html_text)
        note_urls = self._collect_note_urls(hrefs, require_num)
        raw = {
            "query": q,
            "search_url": "mock://search",
            "note_urls": note_urls,
            "html": html_text,
        }
        normalized = [
            {"query": q, "note_url": u, "note_id": _extract_note_id(u), "crawled_at": _iso_now()}
            for u in note_urls
        ]
        return raw, normalized

    def _run_user_profile_mock(
        self, user_url: str | None, user_id: str | None, payload: Dict[str, Any]
    ) -> Tuple[Any, Any]:
        if user_url:
            url = str(user_url).strip()
        elif user_id:
            url = f"https://www.xiaohongshu.com/user/profile/{user_id}"
        else:
            raise ValueError("missing_user_id")
        html_text = str(payload.get("mock_html") or payload.get("html") or "").strip()
        if html_text == "":
            html_text = "<html><head><title>mock</title></head><body></body></html>"
        extracted = _extract_profile_fields_from_html(html_text)
        resolved_user_id = user_id or extracted.get("user_id") or _extract_user_id(url)
        raw = {
            "url": url,
            "user_id": resolved_user_id,
            "title": extracted.get("title"),
            "nickname": extracted.get("nickname"),
            "html": html_text,
        }
        normalized = {
            "user_url": url,
            "user_id": resolved_user_id,
            "title": extracted.get("title"),
            "nickname": extracted.get("nickname"),
            "crawled_at": _iso_now(),
        }
        return raw, normalized

    def _build_meta(
        self,
        *,
        task_id: str,
        operator: str,
        source_type: str,
        source_ref: str,
        ok: bool,
        error_kind: str | None,
        error_message: str | None,
        storage_state_path: str | None,
    ) -> Dict[str, Any]:
        meta: Dict[str, Any] = {
            "task_id": task_id,
            "source_engine": "browser",
            "engine_name": self.name,
            "source_type": source_type,
            "source_ref": source_ref,
            "operator": operator,
            "ingested_at": _iso_now(),
            "dedup_key": _dedup_key(source_type, source_ref),
            "ok": bool(ok),
        }
        if storage_state_path:
            meta["storage_state_path"] = storage_state_path
        if not ok:
            meta["error_kind"] = error_kind or "parse"
            meta["error_message"] = error_message or "unknown_error"
        return meta
```
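`_build_meta` stamps every result with a `dedup_key`, the SHA-1 of `source_type:source_ref` (the helper shown in `engines/spider_xhs.py` below; this file presumably carries an identical one), so re-running the same task yields the same key and is detectable downstream:

```python
import hashlib

def dedup_key(source_type: str, source_ref: str) -> str:
    # Mirrors the module-level _dedup_key helper used by _build_meta.
    payload = f"{source_type}:{source_ref}".encode("utf-8", errors="ignore")
    return hashlib.sha1(payload).hexdigest()

print(dedup_key("search", "咖啡店"))  # stable 40-char hex digest
print(dedup_key("search", "咖啡店") == dedup_key("search", "咖啡店"))  # True
```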
engines/spider_xhs.py
ADDED
@@ -0,0 +1,304 @@

```python
from __future__ import annotations

import hashlib
import os
import urllib.parse
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple

from apis.xhs_pc_apis import XHS_Apis
from xhs_utils.data_util import handle_note_info, handle_user_info
from xhs_utils.response_guard import get_dict, get_list

from ..service.tasks import TaskRecord
from .base import EngineRunOutput


def _iso_now() -> str:
    return datetime.now(timezone.utc).isoformat()


def _dedup_key(source_type: str, source_ref: str) -> str:
    payload = f"{source_type}:{source_ref}".encode("utf-8", errors="ignore")
    return hashlib.sha1(payload).hexdigest()


def _classify_httpclient_msg(msg: str) -> str:
    m = (msg or "").lower()
    if "proxyerror" in m or "proxy" in m or "tunnel" in m:
        return "proxy_failed"
    if "timeout" in m:
        return "timeout"
    if "auth_required" in m or "redirect_to_login" in m or "missing_cookie" in m:
        return "auth"
    if "rate_limited" in m or "http_status=429" in m:
        return "rate"
    if "risk_control" in m or "captcha" in m or "验证码" in msg or "验证" in msg:
        return "risk"
    if "invalid_json" in m:
        return "parse"
    return "parse"
```
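`_classify_httpclient_msg` buckets free-form error strings into the retry/reporting categories used throughout both engines, with `parse` as the fallback:

```python
for msg in (
    "ProxyError: tunnel connection failed",  # -> proxy_failed
    "read timeout after 10s",                # -> timeout
    "redirect_to_login",                     # -> auth
    "http_status=429",                       # -> rate
    "risk_control: 请输入验证码",             # -> risk
    "something unexpected",                  # -> parse (fallback)
):
    print(msg, "->", _classify_httpclient_msg(msg))
```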
```python
def _proxy_dict(proxy: str | None) -> dict | None:
    if proxy is None or proxy == "":
        return None
    return {"http": proxy, "https": proxy}


def _get_cookie(payload: Dict[str, Any]) -> Tuple[str | None, str | None]:
    cookies = payload.get("cookies") or payload.get("cookie") or payload.get("cookies_str")
    if cookies:
        return str(cookies), None
    env_cookie = os.getenv("COOKIES") or os.getenv("XHS_COOKIES")
    if env_cookie:
        return env_cookie, None
    return None, "missing_cookie"


def _parse_user_id(payload: Dict[str, Any]) -> str | None:
    user_id = payload.get("user_id") or payload.get("uid")
    if user_id:
        return str(user_id)
    user_url = payload.get("user_url") or payload.get("url")
    if not user_url:
        return None
    try:
        parsed = urllib.parse.urlparse(str(user_url))
        return parsed.path.split("/")[-1] or None
    except Exception:
        return None


class SpiderXHSEngine:
    name = "spider_xhs"

    def __init__(self, *, proxy: str | None = None):
        self.proxy = proxy
        self._apis = XHS_Apis()

    def run(self, task: TaskRecord) -> EngineRunOutput:
        source_type = task.task_type
        payload = task.payload or {}
        operator = str(payload.get("operator") or "system")

        raw: Any = None
        normalized: Any = None
        ok = False
        error_kind: str | None = None
        error_message: str | None = None
        source_ref: str = ""
        proxy_used: str | None = None

        cookies_str, cookie_err = _get_cookie(payload)
        if cookie_err is not None:
            error_kind = _classify_httpclient_msg(cookie_err)
            error_message = cookie_err
            source_ref = str(payload.get("note_url") or payload.get("query") or payload.get("user_id") or "")
            meta = self._build_meta(
                task_id=task.id,
                operator=operator,
                source_type=source_type,
                source_ref=source_ref,
                ok=False,
                error_kind=error_kind,
                error_message=error_message,
            )
            return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)

        proxies = None
        if isinstance(payload.get("proxies"), dict):
            proxies = payload.get("proxies")
        if isinstance(proxies, dict):
            candidate = proxies.get("http") or proxies.get("https")
            if candidate:
                proxy_used = str(candidate)
        else:
            explicit_proxy = payload.get("proxy") or payload.get("proxies")
            proxy_value = explicit_proxy or self.proxy
            if not proxy_value:
                from ..service.proxy_pool import proxy_pool

                proxy_value = proxy_pool.get_random_proxy()
            if proxy_value:
                proxy_used = str(proxy_value)
                proxies = _proxy_dict(proxy_used)

        try:
            if source_type == "note_url":
                source_ref = str(payload.get("note_url") or payload.get("url") or "")
                raw, normalized = self._run_note_url(source_ref, cookies_str, proxies)
            elif source_type == "search":
                source_ref = str(payload.get("query") or payload.get("keyword") or "")
                raw, normalized = self._run_search(source_ref, payload, cookies_str, proxies)
            elif source_type == "user_profile":
                user_id = _parse_user_id(payload)
                source_ref = str(user_id or "")
                raw, normalized = self._run_user_profile(user_id, cookies_str, proxies)
            else:
                error_kind = "parse"
                error_message = f"unsupported_task_type={source_type}"
                meta = self._build_meta(
                    task_id=task.id,
                    operator=operator,
                    source_type=source_type,
                    source_ref=source_ref,
                    ok=False,
                    error_kind=error_kind,
                    error_message=error_message,
                )
                return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)

            ok = True
        except Exception as e:
            error_message = str(e)
            error_kind = _classify_httpclient_msg(error_message)

        try:
            if proxy_used:
                from ..service.proxy_pool import proxy_pool

                if ok:
                    proxy_pool.report_success(proxy_used)
                else:
                    reason = error_kind or "parse"
                    if reason in ("timeout", "proxy_failed", "rate"):
                        proxy_pool.report_failure(proxy_used, reason)
        except Exception:
            pass

        meta = self._build_meta(
            task_id=task.id,
            operator=operator,
            source_type=source_type,
            source_ref=source_ref,
            ok=ok,
            error_kind=error_kind,
            error_message=error_message,
        )
        return EngineRunOutput(raw=raw, normalized=normalized, meta=meta)

    def _run_note_url(
        self,
        note_url: str,
        cookies_str: str,
        proxies: dict | None,
    ) -> Tuple[Any, Any]:
        if not note_url:
            raise ValueError("missing_note_url")
        success, msg, note_info = self._apis.get_note_info(note_url, cookies_str, proxies)
        raw = note_info
        if not success:
            raise RuntimeError(msg)
        data = get_dict(note_info, "data", context="get_note_info")
        items = get_list(data, "items", context="get_note_info")
        if not items:
            raise ValueError("empty_items")
        item = items[0]
        item["url"] = note_url
        normalized = handle_note_info(item)
        return raw, normalized

    def _run_search(
        self,
        query: str,
        payload: Dict[str, Any],
        cookies_str: str,
        proxies: dict | None,
    ) -> Tuple[Any, Any]:
        if not query:
            raise ValueError("missing_query")
        require_num = int(payload.get("require_num") or payload.get("limit") or 20)
        sort_type_choice = int(payload.get("sort_type_choice") or 0)
        note_type = int(payload.get("note_type") or 0)
        note_time = int(payload.get("note_time") or 0)
        note_range = int(payload.get("note_range") or 0)
        pos_distance = int(payload.get("pos_distance") or 0)
        geo = payload.get("geo") or ""

        success, msg, notes = self._apis.search_some_note(
            query,
            require_num,
            cookies_str,
            sort_type_choice,
            note_type,
            note_time,
            note_range,
            pos_distance,
            geo,
            proxies,
        )
        raw = notes
        if not success:
            raise RuntimeError(msg)

        normalized: List[Dict[str, Any]] = []
        for item in notes or []:
            if isinstance(item, dict) and item.get("model_type") and item.get("model_type") != "note":
                continue
            if not isinstance(item, dict):
                continue
            note_id = item.get("id") or item.get("note_id")
            xsec_token = item.get("xsec_token")
            note_url = None
            if note_id and xsec_token:
                note_url = f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={xsec_token}"
            title = (
                (item.get("note_card") or {}).get("title")
                if isinstance(item.get("note_card"), dict)
                else item.get("title")
            )
            normalized.append(
                {
                    "query": query,
                    "note_id": note_id,
                    "note_url": note_url,
                    "xsec_token": xsec_token,
                    "title": title,
                }
            )

        return raw, normalized

    def _run_user_profile(
        self,
        user_id: str | None,
        cookies_str: str,
        proxies: dict | None,
    ) -> Tuple[Any, Any]:
        if not user_id:
            raise ValueError("missing_user_id")
        success, msg, user_info = self._apis.get_user_info(user_id, cookies_str, proxies)
        raw = user_info
        if not success:
            raise RuntimeError(msg)
        data = get_dict(user_info, "data", context="get_user_info")
        normalized = handle_user_info(data, user_id)
        return raw, normalized

    def _build_meta(
        self,
        *,
        task_id: str,
        operator: str,
        source_type: str,
        source_ref: str,
        ok: bool,
        error_kind: str | None,
        error_message: str | None,
    ) -> Dict[str, Any]:
        meta: Dict[str, Any] = {
            "task_id": task_id,
            "source_engine": "api",
            "engine_name": self.name,
            "source_type": source_type,
            "source_ref": source_ref,
            "operator": operator,
            "ingested_at": _iso_now(),
            "dedup_key": _dedup_key(source_type, source_ref),
            "ok": bool(ok),
        }
        if not ok:
            meta["error_kind"] = error_kind or "parse"
            meta["error_message"] = error_message or "unknown_error"
        return meta
```
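A minimal sketch of driving this engine directly. `TaskRecord` is only known here through the three attributes the engine reads (`id`, `task_type`, `payload`), so the constructor call below is an assumption, and the cookie string is a placeholder:

```python
import os

from engines.spider_xhs import SpiderXHSEngine

# TaskRecord's import path depends on the service package layout; assumed available here.
os.environ.setdefault("COOKIES", "a1=...; web_session=...")  # placeholder cookie string

task = TaskRecord(  # assumed constructor shape; adjust to the real service.tasks model
    id="demo-task-1",
    task_type="search",
    payload={"query": "咖啡店", "require_num": 10, "operator": "alice"},
)
out = SpiderXHSEngine().run(task)
print(out.meta["ok"], out.meta.get("error_kind"))
for row in out.normalized or []:
    print(row["note_id"], row["title"])
```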
extension/background.js
ADDED
@@ -0,0 +1,136 @@

```js
const getActiveTab = () =>
  new Promise((resolve) => {
    chrome.tabs.query({ active: true, currentWindow: true }, (tabs) => resolve(tabs && tabs[0] ? tabs[0] : null));
  });

const storageGet = (keys) =>
  new Promise((resolve) => {
    chrome.storage.local.get(keys, (items) => resolve(items || {}));
  });

const storageSet = (items) =>
  new Promise((resolve) => {
    chrome.storage.local.set(items, () => resolve());
  });

const sendToTab = (tabId, message) =>
  new Promise((resolve, reject) => {
    chrome.tabs.sendMessage(tabId, message, (resp) => {
      const err = chrome.runtime.lastError;
      if (err) {
        reject(new Error(err.message));
        return;
      }
      resolve(resp);
    });
  });

const badge = async (text) => {
  try {
    await chrome.action.setBadgeBackgroundColor({ color: "#2D7D46" });
    await chrome.action.setBadgeText({ text: text || "" });
  } catch (e) {}
};

const badgeError = async (text) => {
  try {
    await chrome.action.setBadgeBackgroundColor({ color: "#B42318" });
    await chrome.action.setBadgeText({ text: text || "" });
  } catch (e) {}
};

const normalizeBaseUrl = (url) => {
  const u = String(url || "").trim();
  if (!u) return "";
  return u.endsWith("/") ? u.slice(0, -1) : u;
};

const extractTaskId = (text) => {
  const t = String(text || "").trim();
  if (!t) return "";
  const m = t.match(/[0-9a-fA-F]{32}/);
  return m ? m[0] : t;
};

const getOrPromptServiceUrl = async (tabId) => {
  const items = await storageGet(["service_url"]);
  const existing = normalizeBaseUrl(items.service_url);
  if (existing) return existing;
  const resp = await sendToTab(tabId, {
    type: "PROMPT",
    text: "Spider_XHS 服务地址(例如 http://localhost:8000)",
    defaultValue: "http://localhost:8000",
  });
  const v = normalizeBaseUrl(resp && resp.value);
  if (v) await storageSet({ service_url: v });
  return v;
};

const getTaskId = async (tabId) => {
  let clip = "";
  try {
    const resp = await sendToTab(tabId, { type: "READ_CLIPBOARD" });
    clip = extractTaskId(resp && resp.text);
  } catch (e) {}
  if (clip) return clip;
  const resp = await sendToTab(tabId, { type: "PROMPT", text: "task_id", defaultValue: "" });
  return extractTaskId(resp && resp.value);
};

const collectPage = async (tabId) => {
  const resp = await sendToTab(tabId, { type: "COLLECT_PAGE" });
  return resp || {};
};

chrome.action.onClicked.addListener(async () => {
  await badge("...");
  try {
    const tab = await getActiveTab();
    const tabId = tab && tab.id;
    if (!tabId) {
      await badgeError("NO");
      return;
    }

    const serviceUrl = await getOrPromptServiceUrl(tabId);
    if (!serviceUrl) {
      await badgeError("URL");
      return;
    }

    const taskId = await getTaskId(tabId);
    if (!taskId) {
      await badgeError("ID");
      return;
    }

    const page = await collectPage(tabId);
    const url = String(page.url || "").trim();
    const html = String(page.html || "");

    const body = {
      task_id: taskId,
      raw: { url, html },
      normalized: { url, kind: "page" },
      meta: { source_engine: "extension_rpa", source_type: "page", source_ref: url, ingested_at: new Date().toISOString(), ok: true },
    };

    const endpoint = `${serviceUrl}/api/v1/import/extension`;
    const resp = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
    });

    if (resp.ok) {
      await badge("OK");
    } else {
      await badgeError(String(resp.status || "ERR"));
    }
  } catch (e) {
    await badgeError("ERR");
  } finally {
    setTimeout(() => badge(""), 2000);
  }
});
```
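On click, the worker resolves the service URL (cached in `chrome.storage.local`), pulls a 32-hex `task_id` from the clipboard or a prompt, snapshots the active tab, and POSTs it to `/api/v1/import/extension`. The same payload can be replayed without the browser, e.g. from Python; the base URL here is an assumption matching the extension's default prompt, and the page URL is illustrative:

```python
from datetime import datetime, timezone

import requests

url = "https://www.xiaohongshu.com/explore/abc123"  # illustrative page URL
body = {
    "task_id": "0" * 32,  # placeholder 32-hex id, as extractTaskId expects
    "raw": {"url": url, "html": "<html>...</html>"},
    "normalized": {"url": url, "kind": "page"},
    "meta": {
        "source_engine": "extension_rpa",
        "source_type": "page",
        "source_ref": url,
        "ingested_at": datetime.now(timezone.utc).isoformat(),
        "ok": True,
    },
}
resp = requests.post("http://localhost:8000/api/v1/import/extension", json=body, timeout=10)
print(resp.status_code)
```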
extension/content.js
ADDED
@@ -0,0 +1,35 @@

```js
const safeText = (v) => {
  try {
    return String(v == null ? "" : v);
  } catch (e) {
    return "";
  }
};

chrome.runtime.onMessage.addListener((message, _sender, sendResponse) => {
  const type = message && message.type;
  if (type === "COLLECT_PAGE") {
    sendResponse({ url: location.href, html: document.documentElement ? document.documentElement.outerHTML : "" });
    return true;
  }

  if (type === "PROMPT") {
    const text = safeText(message && message.text);
    const defaultValue = safeText(message && message.defaultValue);
    const value = window.prompt(text, defaultValue);
    sendResponse({ value: value == null ? "" : safeText(value) });
    return true;
  }

  if (type === "READ_CLIPBOARD") {
    navigator.clipboard
      .readText()
      .then((text) => sendResponse({ text: safeText(text) }))
      .catch(() => sendResponse({ text: "" }));
    return true;
  }

  sendResponse({});
  return true;
});
```
extension/manifest.json
ADDED
@@ -0,0 +1,20 @@

```json
{
  "manifest_version": 3,
  "name": "Spider_XHS Extension RPA Import",
  "version": "0.1.0",
  "action": {
    "default_title": "Import to Spider_XHS"
  },
  "background": {
    "service_worker": "background.js"
  },
  "permissions": ["activeTab", "scripting", "storage"],
  "host_permissions": ["<all_urls>"],
  "content_scripts": [
    {
      "matches": ["<all_urls>"],
      "js": ["content.js"],
      "run_at": "document_idle"
    }
  ]
}
```
frontend/.gitignore
ADDED
@@ -0,0 +1,24 @@

```
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

node_modules
dist
dist-ssr
*.local

# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
```
frontend/README.md
ADDED
@@ -0,0 +1,73 @@

````markdown
# React + TypeScript + Vite

This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.

Currently, two official plugins are available:

- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)

## React Compiler

The React Compiler is not enabled on this template because of its impact on dev and build performance. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).

## Expanding the ESLint configuration

If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:

```js
export default defineConfig([
  globalIgnores(['dist']),
  {
    files: ['**/*.{ts,tsx}'],
    extends: [
      // Other configs...

      // Remove tseslint.configs.recommended and replace with this
      tseslint.configs.recommendedTypeChecked,
      // Alternatively, use this for stricter rules
      tseslint.configs.strictTypeChecked,
      // Optionally, add this for stylistic rules
      tseslint.configs.stylisticTypeChecked,

      // Other configs...
    ],
    languageOptions: {
      parserOptions: {
        project: ['./tsconfig.node.json', './tsconfig.app.json'],
        tsconfigRootDir: import.meta.dirname,
      },
      // other options...
    },
  },
])
```

You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:

```js
// eslint.config.js
import reactX from 'eslint-plugin-react-x'
import reactDom from 'eslint-plugin-react-dom'

export default defineConfig([
  globalIgnores(['dist']),
  {
    files: ['**/*.{ts,tsx}'],
    extends: [
      // Other configs...
      // Enable lint rules for React
      reactX.configs['recommended-typescript'],
      // Enable lint rules for React DOM
      reactDom.configs.recommended,
    ],
    languageOptions: {
      parserOptions: {
        project: ['./tsconfig.node.json', './tsconfig.app.json'],
        tsconfigRootDir: import.meta.dirname,
      },
      // other options...
    },
  },
])
```
````
frontend/eslint.config.js
ADDED
@@ -0,0 +1,23 @@

```js
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'
import { defineConfig, globalIgnores } from 'eslint/config'

export default defineConfig([
  globalIgnores(['dist']),
  {
    files: ['**/*.{ts,tsx}'],
    extends: [
      js.configs.recommended,
      tseslint.configs.recommended,
      reactHooks.configs.flat.recommended,
      reactRefresh.configs.vite,
    ],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
    },
  },
])
```
frontend/index.html
ADDED
@@ -0,0 +1,13 @@

```html
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>frontend</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
```
frontend/package-lock.json
ADDED
The diff for this file is too large to render; see the raw diff.
frontend/package.json
ADDED
@@ -0,0 +1,34 @@

```json
{
  "name": "frontend",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "tsc -b && vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@ant-design/icons": "^6.0.0",
    "antd": "^5.27.0",
    "axios": "^1.12.2",
    "react": "^19.2.4",
    "react-dom": "^19.2.4",
    "react-router-dom": "^7.9.3"
  },
  "devDependencies": {
    "@eslint/js": "^9.39.4",
    "@types/node": "^24.12.2",
    "@types/react": "^19.2.14",
    "@types/react-dom": "^19.2.3",
    "@vitejs/plugin-react": "^6.0.1",
    "eslint": "^9.39.4",
    "eslint-plugin-react-hooks": "^7.0.1",
    "eslint-plugin-react-refresh": "^0.5.2",
    "globals": "^17.4.0",
    "typescript": "~6.0.2",
    "typescript-eslint": "^8.58.0",
    "vite": "^8.0.4"
  }
}
```
frontend/public/favicon.svg
ADDED
frontend/public/icons.svg
ADDED
frontend/src/App.css
ADDED
@@ -0,0 +1,184 @@

```css
.counter {
  font-size: 16px;
  padding: 5px 10px;
  border-radius: 5px;
  color: var(--accent);
  background: var(--accent-bg);
  border: 2px solid transparent;
  transition: border-color 0.3s;
  margin-bottom: 24px;

  &:hover {
    border-color: var(--accent-border);
  }
  &:focus-visible {
    outline: 2px solid var(--accent);
    outline-offset: 2px;
  }
}

.hero {
  position: relative;

  .base,
  .framework,
  .vite {
    inset-inline: 0;
    margin: 0 auto;
  }

  .base {
    width: 170px;
    position: relative;
    z-index: 0;
  }

  .framework,
  .vite {
    position: absolute;
  }

  .framework {
    z-index: 1;
    top: 34px;
    height: 28px;
    transform: perspective(2000px) rotateZ(300deg) rotateX(44deg) rotateY(39deg)
      scale(1.4);
  }

  .vite {
    z-index: 0;
    top: 107px;
    height: 26px;
    width: auto;
    transform: perspective(2000px) rotateZ(300deg) rotateX(40deg) rotateY(39deg)
      scale(0.8);
  }
}

#center {
  display: flex;
  flex-direction: column;
  gap: 25px;
  place-content: center;
  place-items: center;
  flex-grow: 1;

  @media (max-width: 1024px) {
    padding: 32px 20px 24px;
    gap: 18px;
  }
}

#next-steps {
  display: flex;
  border-top: 1px solid var(--border);
  text-align: left;

  & > div {
    flex: 1 1 0;
    padding: 32px;
    @media (max-width: 1024px) {
      padding: 24px 20px;
    }
  }

  .icon {
    margin-bottom: 16px;
    width: 22px;
    height: 22px;
  }

  @media (max-width: 1024px) {
    flex-direction: column;
    text-align: center;
  }
}

#docs {
  border-right: 1px solid var(--border);

  @media (max-width: 1024px) {
    border-right: none;
    border-bottom: 1px solid var(--border);
  }
}

#next-steps ul {
  list-style: none;
  padding: 0;
  display: flex;
  gap: 8px;
  margin: 32px 0 0;

  .logo {
    height: 18px;
  }

  a {
    color: var(--text-h);
    font-size: 16px;
    border-radius: 6px;
    background: var(--social-bg);
    display: flex;
    padding: 6px 12px;
    align-items: center;
    gap: 8px;
    text-decoration: none;
    transition: box-shadow 0.3s;

    &:hover {
      box-shadow: var(--shadow);
    }
    .button-icon {
      height: 18px;
      width: 18px;
    }
  }

  @media (max-width: 1024px) {
    margin-top: 20px;
    flex-wrap: wrap;
    justify-content: center;

    li {
      flex: 1 1 calc(50% - 8px);
    }

    a {
      width: 100%;
      justify-content: center;
      box-sizing: border-box;
    }
  }
}

#spacer {
  height: 88px;
  border-top: 1px solid var(--border);
  @media (max-width: 1024px) {
    height: 48px;
  }
}

.ticks {
  position: relative;
  width: 100%;

  &::before,
  &::after {
    content: '';
    position: absolute;
    top: -4.5px;
    border: 5px solid transparent;
  }

  &::before {
    left: 0;
    border-left-color: var(--border);
  }
  &::after {
    right: 0;
    border-right-color: var(--border);
  }
}
```
frontend/src/App.tsx
ADDED
@@ -0,0 +1,42 @@

```tsx
import { Navigate, Route, Routes } from 'react-router-dom'
import AppLayout from './layouts/AppLayout'
import CleanedNotesPage from './pages/CleanedNotesPage'
import DashboardPage from './pages/DashboardPage'
import ErrorsPage from './pages/ErrorsPage'
import MetricsPage from './pages/MetricsPage'
import RawNotesPage from './pages/RawNotesPage'
import RpaPage from './pages/RpaPage'
import ResourcesAccountsPage from './pages/ResourcesAccountsPage'
import ResourcesProxiesPage from './pages/ResourcesProxiesPage'
import ResourcesSessionsPage from './pages/ResourcesSessionsPage'
import TaskDetailPage from './pages/TaskDetailPage'
import TasksPage from './pages/TasksPage'
import AIGenerationPage from './pages/AIGenerationPage'
import ComplianceReviewPage from './pages/ComplianceReviewPage'
import LeadsPage from './pages/LeadsPage'

export default function App() {
  return (
    <Routes>
      <Route element={<AppLayout />}>
        <Route index element={<Navigate to="/dashboard" replace />} />
        <Route path="/dashboard" element={<DashboardPage />} />
        <Route path="/tasks" element={<TasksPage />} />
        <Route path="/tasks/:id" element={<TaskDetailPage />} />
        <Route path="/rpa" element={<RpaPage />} />
        <Route path="/errors" element={<ErrorsPage />} />
        <Route path="/metrics" element={<MetricsPage />} />
        <Route path="/resources" element={<Navigate to="/resources/accounts" replace />} />
        <Route path="/resources/accounts" element={<ResourcesAccountsPage />} />
        <Route path="/resources/sessions" element={<ResourcesSessionsPage />} />
        <Route path="/resources/proxies" element={<ResourcesProxiesPage />} />
        <Route path="/content/raw-notes" element={<RawNotesPage />} />
        <Route path="/content/cleaned-notes" element={<CleanedNotesPage />} />
        <Route path="/ai/generation" element={<AIGenerationPage />} />
        <Route path="/compliance/review" element={<ComplianceReviewPage />} />
        <Route path="/leads" element={<LeadsPage />} />
        <Route path="*" element={<Navigate to="/dashboard" replace />} />
      </Route>
    </Routes>
  )
}
```
frontend/src/assets/hero.png
ADDED
frontend/src/assets/react.svg
ADDED
frontend/src/assets/vite.svg
ADDED
frontend/src/components/JsonViewer.tsx
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+import { Tree, Typography } from 'antd'
+import type { DataNode } from 'antd/es/tree'
+import { useMemo } from 'react'
+
+export type JsonViewerProps = {
+  value: unknown
+  height?: number
+  defaultExpandAll?: boolean
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return !!value && typeof value === 'object' && !Array.isArray(value)
+}
+
+function preview(value: unknown) {
+  if (value === null) return 'null'
+  if (value === undefined) return 'undefined'
+  if (typeof value === 'string') return JSON.stringify(value)
+  if (typeof value === 'number' || typeof value === 'boolean') return String(value)
+  if (Array.isArray(value)) return `[${value.length}]`
+  if (isRecord(value)) return `{${Object.keys(value).length}}`
+  return String(value)
+}
+
+function toTreeData(value: unknown, path: string): DataNode[] {
+  if (Array.isArray(value)) {
+    return value.map((item, idx) => {
+      const nextPath = `${path}[${idx}]`
+      const hasChildren = Array.isArray(item) || isRecord(item)
+      return {
+        key: nextPath,
+        title: (
+          <span>
+            <Typography.Text code>{idx}</Typography.Text>
+            <Typography.Text type="secondary" style={{ marginLeft: 8 }}>
+              {preview(item)}
+            </Typography.Text>
+          </span>
+        ),
+        children: hasChildren ? toTreeData(item, nextPath) : undefined,
+      }
+    })
+  }
+
+  if (isRecord(value)) {
+    return Object.entries(value).map(([k, v]) => {
+      const nextPath = path ? `${path}.${k}` : k
+      const hasChildren = Array.isArray(v) || isRecord(v)
+      return {
+        key: nextPath,
+        title: (
+          <span>
+            <Typography.Text code>{k}</Typography.Text>
+            <Typography.Text type="secondary" style={{ marginLeft: 8 }}>
+              {preview(v)}
+            </Typography.Text>
+          </span>
+        ),
+        children: hasChildren ? toTreeData(v, nextPath) : undefined,
+      }
+    })
+  }
+
+  return [
+    {
+      key: path || 'value',
+      title: <Typography.Text type="secondary">{preview(value)}</Typography.Text>,
+    },
+  ]
+}
+
+export default function JsonViewer({
+  value,
+  height = 360,
+  defaultExpandAll = true,
+}: JsonViewerProps) {
+  const treeData = useMemo<DataNode[]>(() => toTreeData(value, ''), [value])
+
+  if (value === null || value === undefined) {
+    return <Typography.Text type="secondary">空</Typography.Text>
+  }
+
+  return (
+    <Tree
+      blockNode
+      showLine
+      height={height}
+      defaultExpandAll={defaultExpandAll}
+      treeData={treeData}
+    />
+  )
+}
+
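JsonViewer renders an arbitrary JSON value as an antd Tree, keying each node by its JSON path (e.g. result.items[0].id), with array and object sizes previewed inline next to each key. A usage sketch; the sample payload is invented for illustration:

import JsonViewer from './components/JsonViewer'

// Arrays preview as [n], objects as {n}, scalars inline next to their key.
const sample = { result: { items: [{ id: 1, title: 'note' }], total: 1 } }

export function Demo() {
  return <JsonViewer value={sample} height={240} defaultExpandAll />
}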
frontend/src/index.css
ADDED
@@ -0,0 +1,9 @@
+html,
+body,
+#root {
+  height: 100%;
+}
+
+body {
+  margin: 0;
+}
frontend/src/layouts/AppLayout.tsx
ADDED
@@ -0,0 +1,171 @@
+import {
+  BugOutlined,
+  DatabaseOutlined,
+  DashboardOutlined,
+  FileTextOutlined,
+  LineChartOutlined,
+  RobotOutlined,
+  SnippetsOutlined,
+  UnorderedListOutlined,
+  MessageOutlined,
+  SafetyCertificateOutlined,
+  PhoneOutlined,
+} from '@ant-design/icons'
+import { Layout, Menu, Typography, theme } from 'antd'
+import type { MenuProps } from 'antd'
+import { useMemo, useState } from 'react'
+import { Link, Outlet, useLocation } from 'react-router-dom'
+
+const { Header, Content, Sider } = Layout
+
+const menuItems: MenuProps['items'] = [
+  {
+    key: '/dashboard',
+    icon: <DashboardOutlined />,
+    label: <Link to="/dashboard">健康概览</Link>,
+  },
+  {
+    key: '/tasks',
+    icon: <UnorderedListOutlined />,
+    label: <Link to="/tasks">任务中心</Link>,
+  },
+  {
+    key: '/resources',
+    icon: <DatabaseOutlined />,
+    label: '资源池',
+    children: [
+      {
+        key: '/resources/accounts',
+        label: <Link to="/resources/accounts">账号池</Link>,
+      },
+      {
+        key: '/resources/sessions',
+        label: <Link to="/resources/sessions">会话池</Link>,
+      },
+      {
+        key: '/resources/proxies',
+        label: <Link to="/resources/proxies">代理池</Link>,
+      },
+    ],
+  },
+  {
+    key: '/errors',
+    icon: <BugOutlined />,
+    label: <Link to="/errors">错误中心</Link>,
+  },
+  {
+    key: '/content/raw-notes',
+    icon: <FileTextOutlined />,
+    label: <Link to="/content/raw-notes">原始快照</Link>,
+  },
+  {
+    key: '/content/cleaned-notes',
+    icon: <SnippetsOutlined />,
+    label: <Link to="/content/cleaned-notes">清洗笔记库</Link>,
+  },
+  {
+    key: '/ai/generation',
+    icon: <MessageOutlined />,
+    label: <Link to="/ai/generation">AI 生产内容</Link>,
+  },
+  {
+    key: '/compliance/review',
+    icon: <SafetyCertificateOutlined />,
+    label: <Link to="/compliance/review">合规检测</Link>,
+  },
+  {
+    key: '/leads',
+    icon: <PhoneOutlined />,
+    label: <Link to="/leads">线索转化池</Link>,
+  },
+  {
+    key: '/rpa',
+    icon: <RobotOutlined />,
+    label: <Link to="/rpa">RPA 兜底</Link>,
+  },
+  {
+    key: '/metrics',
+    icon: <LineChartOutlined />,
+    label: <Link to="/metrics">监控指标</Link>,
+  },
+]
+
+export default function AppLayout() {
+  const location = useLocation()
+  const {
+    token: { colorBgContainer, borderRadiusLG, colorBorderSecondary },
+  } = theme.useToken()
+
+  const selectedKeys = useMemo(() => {
+    if (location.pathname.startsWith('/tasks')) return ['/tasks']
+    if (location.pathname.startsWith('/errors')) return ['/errors']
+    return [location.pathname]
+  }, [location.pathname])
+
+  const defaultOpenKeys = useMemo(() => {
+    if (location.pathname.startsWith('/resources')) return ['/resources']
+    return []
+  }, [location.pathname])
+
+  const [openKeys, setOpenKeys] = useState(defaultOpenKeys)
+
+  const onOpenChange = (keys: string[]) => {
+    setOpenKeys(keys)
+  }
+
+  return (
+    <Layout style={{ minHeight: '100vh' }}>
+      <Sider
+        breakpoint="lg"
+        collapsedWidth={56}
+        style={{ borderRight: `1px solid ${colorBorderSecondary}` }}
+      >
+        <div
+          style={{
+            height: 56,
+            display: 'flex',
+            alignItems: 'center',
+            paddingInline: 16,
+          }}
+        >
+          <Typography.Text strong style={{ color: '#fff' }}>
+            Spider XHS
+          </Typography.Text>
+        </div>
+        <Menu
+          theme="dark"
+          mode="inline"
+          selectedKeys={selectedKeys}
+          openKeys={openKeys}
+          onOpenChange={onOpenChange}
+          items={menuItems}
+        />
+      </Sider>
+      <Layout>
+        <Header
+          style={{
+            paddingInline: 16,
+            display: 'flex',
+            alignItems: 'center',
+            background: colorBgContainer,
+            borderBottom: `1px solid ${colorBorderSecondary}`,
+          }}
+        >
+          <Typography.Text>Ops Console</Typography.Text>
+        </Header>
+        <Content style={{ padding: 16 }}>
+          <div
+            style={{
+              background: colorBgContainer,
+              padding: 16,
+              minHeight: 280,
+              borderRadius: borderRadiusLG,
+            }}
+          >
+            <Outlet />
+          </div>
+        </Content>
+      </Layout>
+    </Layout>
+  )
+}
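In AppLayout, selectedKeys collapses any /tasks/* or /errors/* path onto its top-level menu key so detail pages keep the parent item highlighted, while every other route, including the /resources/* children, matches its key exactly; note that openKeys is seeded from defaultOpenKeys only on first mount. The same mapping, generalized as an illustrative sketch:

// Illustrative generalization of the selectedKeys memo above (not part of the commit).
function menuKeysFor(pathname: string): string[] {
  for (const prefix of ['/tasks', '/errors']) {
    if (pathname.startsWith(prefix)) return [prefix]
  }
  return [pathname] // exact match; /resources/* children carry their own menu keys
}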
frontend/src/main.tsx
ADDED
@@ -0,0 +1,14 @@
+import { StrictMode } from 'react'
+import { createRoot } from 'react-dom/client'
+import { BrowserRouter } from 'react-router-dom'
+import 'antd/dist/reset.css'
+import './index.css'
+import App from './App.tsx'
+
+createRoot(document.getElementById('root')!).render(
+  <StrictMode>
+    <BrowserRouter>
+      <App />
+    </BrowserRouter>
+  </StrictMode>,
+)
frontend/src/pages/AIGenerationPage.tsx
ADDED
@@ -0,0 +1,77 @@
+import { Card, Table, Typography, Space, Tag, Button } from 'antd'
+import { useCallback, useEffect, useState } from 'react'
+import { listGeneratedPosts, type GeneratedPostRecord } from '../lib/business'
+
+export default function AIGenerationPage() {
+  const [loading, setLoading] = useState(false)
+  const [data, setData] = useState<GeneratedPostRecord[]>([])
+  const [total, setTotal] = useState(0)
+  const [page, setPage] = useState(1)
+  const [pageSize, setPageSize] = useState(20)
+
+  const refresh = useCallback(async (current: number, size: number) => {
+    setLoading(true)
+    try {
+      const res = await listGeneratedPosts({ limit: size, offset: (current - 1) * size })
+      setData(res.posts)
+      setTotal(res.total)
+    } catch (e) {
+      console.error(e)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    refresh(page, pageSize)
+  }, [page, pageSize, refresh])
+
+  const columns = [
+    { title: 'ID', dataIndex: 'id', width: 80 },
+    {
+      title: '生成内容',
+      dataIndex: 'content',
+      ellipsis: true,
+      render: (v: string) => <Typography.Text ellipsis>{v || '—'}</Typography.Text>,
+    },
+    {
+      title: '状态',
+      dataIndex: 'status',
+      width: 100,
+      render: (v: string) => <Tag color={v === 'published' ? 'green' : 'blue'}>{v || '—'}</Tag>,
+    },
+    { title: '创建时间', dataIndex: 'created_at', width: 180 },
+  ]
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Space align="center" style={{ justifyContent: 'space-between' }}>
+        <Typography.Title level={3} style={{ margin: 0 }}>
+          AI 生产内容
+        </Typography.Title>
+        <Button onClick={() => refresh(page, pageSize)} loading={loading}>
+          刷新
+        </Button>
+      </Space>
+
+      <Card>
+        <Table<GeneratedPostRecord>
+          rowKey="id"
+          size="middle"
+          columns={columns}
+          dataSource={data}
+          loading={loading}
+          pagination={{
+            current: page,
+            pageSize,
+            total,
+            onChange: (p, s) => {
+              setPage(p)
+              setPageSize(s)
+            },
+          }}
+        />
+      </Card>
+    </div>
+  )
+}
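This page, and the ComplianceReviewPage and LeadsPage below, consume ../lib/business, which is outside this view. The call sites pin down only part of its shape: a list function taking { limit, offset } and resolving to { posts, total }. A hedged sketch under those constraints; the endpoint path and the plain-fetch transport are guesses, and the real client likely shares the ApiError plumbing in ../lib/api:

// Assumed shape of ../lib/business (module not shown in this commit view).
export type GeneratedPostRecord = {
  id: number
  content?: string
  status?: string
  created_at?: string
  // ComplianceReviewPage additionally reads these:
  compliance_status?: string
  medical_risk_level?: string
  review_status?: string
  hit_words?: string
}

export async function listGeneratedPosts(params: { limit: number; offset: number }) {
  const qs = new URLSearchParams({ limit: String(params.limit), offset: String(params.offset) })
  const res = await fetch(`/api/v1/business/generated-posts?${qs}`) // endpoint path is a guess
  if (!res.ok) throw new Error(`HTTP ${res.status}`)
  return (await res.json()) as { posts: GeneratedPostRecord[]; total: number }
}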
frontend/src/pages/CleanedNotesPage.tsx
ADDED
@@ -0,0 +1,253 @@
+import { Alert, Button, Card, Drawer, Empty, Input, Space, Table, Typography } from 'antd'
+import { useCallback, useEffect, useMemo, useState } from 'react'
+import { useSearchParams } from 'react-router-dom'
+import type { ApiError } from '../lib/api'
+import { isOrchestratorDbUnavailable, listCleanedNotes, type CleanedNoteRecord } from '../lib/content'
+
+export default function CleanedNotesPage() {
+  const [searchParams, setSearchParams] = useSearchParams()
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<ApiError | null>(null)
+  const [notes, setNotes] = useState<CleanedNoteRecord[]>([])
+  const [total, setTotal] = useState(0)
+  const [drawerOpen, setDrawerOpen] = useState(false)
+  const [active, setActive] = useState<CleanedNoteRecord | null>(null)
+
+  const page = useMemo(() => {
+    const raw = Number(searchParams.get('page') || '1')
+    return Number.isFinite(raw) && raw > 0 ? raw : 1
+  }, [searchParams])
+
+  const pageSize = useMemo(() => {
+    const raw = Number(searchParams.get('page_size') || '20')
+    return Number.isFinite(raw) && raw > 0 ? Math.min(raw, 200) : 20
+  }, [searchParams])
+
+  const query = useMemo(() => String(searchParams.get('query') || '').trim(), [searchParams])
+  const [queryInput, setQueryInput] = useState(query)
+
+  useEffect(() => {
+    setQueryInput(query)
+  }, [query])
+
+  const setParam = useCallback(
+    (patch: Record<string, string | undefined>) => {
+      const next = new URLSearchParams(searchParams)
+      Object.entries(patch).forEach(([k, v]) => {
+        const val = String(v || '').trim()
+        if (val) next.set(k, val)
+        else next.delete(k)
+      })
+      setSearchParams(next)
+    },
+    [searchParams, setSearchParams],
+  )
+
+  const loadList = useCallback(async () => {
+    setLoading(true)
+    setError(null)
+    try {
+      const res = await listCleanedNotes({
+        limit: pageSize,
+        offset: (page - 1) * pageSize,
+        query: query || undefined,
+      })
+      setNotes(res.notes || [])
+      setTotal(res.total || 0)
+    } catch (e) {
+      setError(e as ApiError)
+    } finally {
+      setLoading(false)
+    }
+  }, [page, pageSize, query])
+
+  useEffect(() => {
+    void loadList()
+  }, [loadList])
+
+  const columns = useMemo(
+    () => [
+      { title: 'ID', dataIndex: 'id', width: 90 },
+      {
+        title: '原始作者',
+        dataIndex: 'raw_author',
+        width: 160,
+        render: (v: unknown) => (v ? String(v) : '—'),
+      },
+      {
+        title: '原始 URL',
+        dataIndex: 'raw_url',
+        width: 340,
+        ellipsis: true,
+        render: (v: unknown) => {
+          const url = String(v || '').trim()
+          if (!url) return '—'
+          return (
+            <a href={url} target="_blank" rel="noreferrer">
+              {url}
+            </a>
+          )
+        },
+      },
+      {
+        title: '清洗内容',
+        dataIndex: 'cleaned_content',
+        ellipsis: true,
+        render: (_: unknown, record: CleanedNoteRecord) => {
+          const text = String(record.cleaned_content || '').trim()
+          return (
+            <Space>
+              <Typography.Text ellipsis style={{ maxWidth: 520 }}>
+                {text || '—'}
+              </Typography.Text>
+              <Button
+                size="small"
+                disabled={!text}
+                onClick={() => {
+                  setActive(record)
+                  setDrawerOpen(true)
+                }}
+              >
+                查看
+              </Button>
+            </Space>
+          )
+        },
+      },
+      {
+        title: '创建时间',
+        dataIndex: 'created_at',
+        width: 200,
+        render: (v: unknown) => (v ? String(v) : '—'),
+      },
+    ],
+    [],
+  )
+
+  const dbMissing = isOrchestratorDbUnavailable(error)
+  const cleanedText = String(active?.cleaned_content || '').trim()
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Typography.Title level={3} style={{ margin: 0 }}>
+        清洗笔记库
+      </Typography.Title>
+
+      <Card
+        title="GET /api/v1/content/cleaned-notes"
+        extra={
+          <Space wrap>
+            <Input.Search
+              allowClear
+              placeholder="搜索 cleaned_content / author / url / content"
+              value={queryInput}
+              onChange={(e) => setQueryInput(e.target.value)}
+              onSearch={(value) => setParam({ query: value.trim() || undefined, page: '1' })}
+              style={{ width: 360 }}
+            />
+            <Button onClick={loadList} disabled={loading}>
+              刷新
+            </Button>
+          </Space>
+        }
+        loading={loading}
+      >
+        {dbMissing ? (
+          <Empty
+            image={Empty.PRESENTED_IMAGE_SIMPLE}
+            description={
+              <div style={{ maxWidth: 680 }}>
+                <Typography.Text strong>内容库数据库不可用</Typography.Text>
+                <Typography.Paragraph style={{ marginTop: 8, marginBottom: 0 }}>
+                  服务端返回 503,通常表示 ORCHESTRATOR_DB_PATH 未配置、路径不存在或 DB 尚未初始化。
+                  <br />
+                  处理方式:
+                  <br />
+                  1)配置 ORCHESTRATOR_DB_PATH 指向可读的 sqlite 文件(默认 orchestrator/data/mvp.db)
+                  <br />
+                  2)初始化 DB:python Spider_XHS/orchestrator/db_init.py
+                </Typography.Paragraph>
+              </div>
+            }
+          />
+        ) : error ? (
+          <Alert
+            type="error"
+            showIcon
+            message={error.status ? `HTTP ${error.status}` : '请求失败'}
+            description={error.message}
+            style={{ marginBottom: 16 }}
+          />
+        ) : null}
+
+        <Table
+          size="middle"
+          rowKey="id"
+          columns={columns}
+          dataSource={notes}
+          pagination={{
+            current: page,
+            pageSize,
+            total,
+            showSizeChanger: true,
+            pageSizeOptions: [10, 20, 50, 100, 200],
+            onChange: (nextPage, nextSize) => setParam({ page: String(nextPage), page_size: String(nextSize) }),
+          }}
+        />
+      </Card>
+
+      <Drawer
+        title={active ? `cleaned_note #${active.id}` : 'cleaned_note'}
+        open={drawerOpen}
+        onClose={() => setDrawerOpen(false)}
+        width={720}
+      >
+        {active ? (
+          <div style={{ display: 'grid', gap: 12 }}>
+            <div>
+              <Typography.Text type="secondary">raw_note_id:</Typography.Text>
+              <Typography.Text>{active.raw_note_id ?? '—'}</Typography.Text>
+            </div>
+            <div>
+              <Typography.Text type="secondary">raw_author:</Typography.Text>
+              <Typography.Text>{active.raw_author || '—'}</Typography.Text>
+            </div>
+            <div>
+              <Typography.Text type="secondary">raw_url:</Typography.Text>
+              {active.raw_url ? (
+                <a href={active.raw_url} target="_blank" rel="noreferrer">
+                  {active.raw_url}
+                </a>
+              ) : (
+                <Typography.Text>—</Typography.Text>
+              )}
+            </div>
+            <div>
+              <Typography.Text type="secondary">created_at:</Typography.Text>
+              <Typography.Text>{active.created_at || '—'}</Typography.Text>
+            </div>
+            <div>
+              <Typography.Text type="secondary">cleaned_content:</Typography.Text>
+              <pre
+                style={{
+                  marginTop: 8,
+                  marginBottom: 0,
+                  padding: 12,
+                  background: '#fafafa',
+                  border: '1px solid #f0f0f0',
+                  borderRadius: 8,
+                  whiteSpace: 'pre-wrap',
+                  wordBreak: 'break-word',
+                  maxHeight: 520,
+                  overflow: 'auto',
+                }}
+              >
+                {cleanedText || '—'}
+              </pre>
+            </div>
+          </div>
+        ) : null}
+      </Drawer>
+    </div>
+  )
+}
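The empty state above keys off isOrchestratorDbUnavailable(error) from ../lib/content, which is not included in this view; judging from the copy in the Empty description, it presumably recognizes the 503 the server returns when ORCHESTRATOR_DB_PATH is unset, wrong, or uninitialized. A minimal sketch under that assumption:

// Assumption: the helper treats an HTTP 503 from the content API as "orchestrator DB unavailable".
import type { ApiError } from './api'

export function isOrchestratorDbUnavailable(error: ApiError | null): boolean {
  return !!error && error.status === 503
}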
frontend/src/pages/ComplianceReviewPage.tsx
ADDED
@@ -0,0 +1,98 @@
+import { Card, Table, Typography, Space, Tag, Button } from 'antd'
+import { useCallback, useEffect, useState } from 'react'
+import { listGeneratedPosts, type GeneratedPostRecord } from '../lib/business'
+
+export default function ComplianceReviewPage() {
+  const [loading, setLoading] = useState(false)
+  const [data, setData] = useState<GeneratedPostRecord[]>([])
+  const [total, setTotal] = useState(0)
+  const [page, setPage] = useState(1)
+  const [pageSize, setPageSize] = useState(20)
+
+  const refresh = useCallback(async (current: number, size: number) => {
+    setLoading(true)
+    try {
+      const res = await listGeneratedPosts({ limit: size, offset: (current - 1) * size })
+      setData(res.posts)
+      setTotal(res.total)
+    } catch (e) {
+      console.error(e)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    refresh(page, pageSize)
+  }, [page, pageSize, refresh])
+
+  const columns = [
+    { title: 'ID', dataIndex: 'id', width: 80 },
+    {
+      title: 'Content',
+      dataIndex: 'content',
+      ellipsis: true,
+      render: (v: string) => <Typography.Text ellipsis>{v || '—'}</Typography.Text>,
+    },
+    {
+      title: 'Compliance Status',
+      dataIndex: 'compliance_status',
+      width: 150,
+      render: (v: string) => {
+        const color = v === 'passed' ? 'success' : v === 'rejected' ? 'error' : 'warning'
+        return <Tag color={color}>{v || '—'}</Tag>
+      },
+    },
+    {
+      title: 'Medical Risk',
+      dataIndex: 'medical_risk_level',
+      width: 120,
+      render: (v: string) => {
+        const color = v === 'CRITICAL' ? 'red' : v === 'HIGH' ? 'volcano' : v === 'MEDIUM' ? 'orange' : 'green'
+        return <Tag color={color}>{v || '—'}</Tag>
+      },
+    },
+    {
+      title: 'Review Status',
+      dataIndex: 'review_status',
+      width: 150,
+      render: (v: string) => {
+        const color = v === 'approved' ? 'success' : v === 'rejected' ? 'error' : 'default'
+        return <Tag color={color}>{v || '—'}</Tag>
+      },
+    },
+    { title: 'Hit Words', dataIndex: 'hit_words', width: 200, ellipsis: true },
+  ]
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Space align="center" style={{ justifyContent: 'space-between' }}>
+        <Typography.Title level={3} style={{ margin: 0 }}>
+          合规审核
+        </Typography.Title>
+        <Button onClick={() => refresh(page, pageSize)} loading={loading}>
+          刷新
+        </Button>
+      </Space>
+
+      <Card>
+        <Table<GeneratedPostRecord>
+          rowKey="id"
+          size="middle"
+          columns={columns}
+          dataSource={data}
+          loading={loading}
+          pagination={{
+            current: page,
+            pageSize,
+            total,
+            onChange: (p, s) => {
+              setPage(p)
+              setPageSize(s)
+            },
+          }}
+        />
+      </Card>
+    </div>
+  )
+}
frontend/src/pages/DashboardPage.tsx
ADDED
@@ -0,0 +1,232 @@
+import {
+  BarChartOutlined,
+  RobotOutlined,
+  UnorderedListOutlined,
+} from '@ant-design/icons'
+import { Alert, Button, Card, Col, Row, Space, Statistic, Tag, Typography } from 'antd'
+import { useCallback, useEffect, useMemo, useState } from 'react'
+import { useNavigate } from 'react-router-dom'
+import type { ApiError } from '../lib/api'
+import { apiGetWrapped } from '../lib/api'
+
+export default function DashboardPage() {
+  const navigate = useNavigate()
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<ApiError | null>(null)
+  const [data, setData] = useState<HealthData | null>(null)
+  const [missingFields, setMissingFields] = useState<string[]>([])
+
+  const refresh = useCallback(async () => {
+    setLoading(true)
+    setError(null)
+    try {
+      const res = await apiGetWrapped<unknown>('health')
+      const parsed = parseHealthData(res)
+      setData(parsed.data)
+      setMissingFields(parsed.missingFields)
+    } catch (e) {
+      setError(e as ApiError)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    refresh()
+  }, [refresh])
+
+  const statusCounts = useMemo(() => {
+    if (!data?.task_status_counts) return null
+    const entries = Object.entries(data.task_status_counts).filter(
+      ([, v]) => typeof v === 'number' && Number.isFinite(v),
+    )
+    if (entries.length === 0) return null
+    entries.sort((a, b) => b[1] - a[1])
+    return entries
+  }, [data?.task_status_counts])
+
+  const failureRatePercent = useMemo(() => {
+    if (typeof data?.recent_failure_rate !== 'number') return undefined
+    if (!Number.isFinite(data.recent_failure_rate)) return undefined
+    return data.recent_failure_rate * 100
+  }, [data?.recent_failure_rate])
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Space align="center" style={{ justifyContent: 'space-between' }}>
+        <Typography.Title level={3} style={{ margin: 0 }}>
+          Dashboard
+        </Typography.Title>
+        <Button onClick={refresh} loading={loading}>
+          刷新
+        </Button>
+      </Space>
+
+      {error ? (
+        <Alert
+          type="error"
+          showIcon
+          message="健康检查请求失败"
+          description={`${error.status ? `HTTP ${error.status}: ` : ''}${error.message}`}
+        />
+      ) : null}
+
+      {!error && missingFields.length > 0 ? (
+        <Alert
+          type="warning"
+          showIcon
+          message="部分字段缺失"
+          description={`未在响应中识别到:${missingFields.join('、')}`}
+        />
+      ) : null}
+
+      <Row gutter={[16, 16]}>
+        <Col xs={24} sm={12} md={8} lg={6} xl={4}>
+          <Card loading={loading}>
+            <Statistic title="队列长度" value={data?.queue_length ?? '—'} />
+          </Card>
+        </Col>
+        <Col xs={24} sm={12} md={8} lg={6} xl={4}>
+          <Card loading={loading}>
+            <Statistic title="在途任务" value={data?.inflight_count ?? '—'} />
+          </Card>
+        </Col>
+        <Col xs={24} sm={12} md={8} lg={6} xl={4}>
+          <Card loading={loading}>
+            <Statistic title="累计成功" value={data?.success_count ?? '—'} />
+          </Card>
+        </Col>
+        <Col xs={24} sm={12} md={8} lg={6} xl={4}>
+          <Card loading={loading}>
+            <Statistic title="累计失败" value={data?.fail_count ?? '—'} />
+          </Card>
+        </Col>
+        <Col xs={24} sm={12} md={8} lg={6} xl={4}>
+          <Card loading={loading}>
+            <Statistic
+              title="近期失败率"
+              value={failureRatePercent ?? '—'}
+              precision={failureRatePercent !== undefined ? 2 : undefined}
+              suffix={failureRatePercent !== undefined ? '%' : undefined}
+            />
+            {typeof data?.recent_failure_window_seconds === 'number' &&
+            Number.isFinite(data.recent_failure_window_seconds) ? (
+              <Typography.Text type="secondary">
+                窗口:{data.recent_failure_window_seconds}s
+              </Typography.Text>
+            ) : null}
+          </Card>
+        </Col>
+      </Row>
+
+      {statusCounts ? (
+        <Card title="任务状态分布" loading={loading}>
+          <Space wrap>
+            {statusCounts.map(([status, count]) => (
+              <Tag key={status} color={statusColor(status)}>
+                {status}: {count}
+              </Tag>
+            ))}
+          </Space>
+        </Card>
+      ) : null}
+
+      <Card title="快捷入口">
+        <Space wrap>
+          <Button icon={<UnorderedListOutlined />} onClick={() => navigate('/tasks')}>
+            任务中心
+          </Button>
+          <Button icon={<RobotOutlined />} onClick={() => navigate('/rpa')}>
+            等待 RPA
+          </Button>
+          <Button icon={<BarChartOutlined />} onClick={() => navigate('/metrics')}>
+            监控指标
+          </Button>
+        </Space>
+      </Card>
+    </div>
+  )
+}
+
+type HealthData = {
+  status?: string
+  queue_length?: number
+  inflight_count?: number
+  success_count?: number
+  fail_count?: number
+  recent_failure_rate?: number
+  recent_failure_window_seconds?: number
+  task_status_counts?: Record<string, number>
+}
+
+function asRecord(value: unknown): Record<string, unknown> | null {
+  if (!value || typeof value !== 'object') return null
+  if (Array.isArray(value)) return null
+  return value as Record<string, unknown>
+}
+
+function asNumber(value: unknown): number | undefined {
+  if (typeof value === 'number' && Number.isFinite(value)) return value
+  if (typeof value === 'string') {
+    const v = value.trim()
+    if (v === '') return undefined
+    const n = Number(v)
+    if (Number.isFinite(n)) return n
+  }
+  return undefined
+}
+
+function parseHealthData(value: unknown) {
+  const obj = asRecord(value) ?? {}
+
+  const data: HealthData = {
+    status: typeof obj.status === 'string' ? obj.status : undefined,
+    queue_length: asNumber(obj.queue_length),
+    inflight_count: asNumber(obj.inflight_count),
+    success_count: asNumber(obj.success_count),
+    fail_count: asNumber(obj.fail_count),
+    recent_failure_rate: asNumber(obj.recent_failure_rate),
+    recent_failure_window_seconds: asNumber(obj.recent_failure_window_seconds),
+  }
+
+  const statusCountsRaw =
+    asRecord(obj.task_status_counts) ??
+    asRecord(obj.task_status_count) ??
+    asRecord(obj.status_counts) ??
+    asRecord(obj.task_counts_by_status)
+
+  if (statusCountsRaw) {
+    const normalized: Record<string, number> = {}
+    for (const [k, v] of Object.entries(statusCountsRaw)) {
+      const n = asNumber(v)
+      if (n !== undefined) normalized[String(k)] = Math.max(0, Math.trunc(n))
+    }
+    if (Object.keys(normalized).length > 0) data.task_status_counts = normalized
+  }
+
+  const missing: string[] = []
+  for (const k of [
+    'queue_length',
+    'inflight_count',
+    'success_count',
+    'fail_count',
+    'recent_failure_rate',
+  ]) {
+    if (asNumber(obj[k]) === undefined) missing.push(k)
+  }
+
+  return { data, missingFields: missing }
+}
+
+function statusColor(status: string) {
+  const v = String(status || '').toLowerCase()
+  if (v === 'succeeded') return 'green'
+  if (v === 'failed') return 'red'
+  if (v === 'queued') return 'blue'
+  if (v === 'running' || v === 'retrying' || v === 'fallback_running') return 'processing'
+  if (v === 'waiting_rpa' || v === 'rpa_running') return 'gold'
+  if (v === 'risk_paused') return 'orange'
+  if (v === 'rpa_imported') return 'cyan'
+  if (v === 'rpa_failed') return 'volcano'
+  return 'default'
+}
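parseHealthData deliberately tolerates numbers serialized as strings (via asNumber) and four spellings of the status-count map, so a slightly older or newer orchestrator still renders. Both of these invented payloads, for example, normalize the same way:

// Equivalent after normalization: queue_length = 3, task_status_counts = { failed: 2 }.
parseHealthData({ queue_length: 3, task_status_counts: { failed: 2 } })
parseHealthData({ queue_length: '3', status_counts: { failed: '2' } })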
frontend/src/pages/ErrorsPage.tsx
ADDED
@@ -0,0 +1,276 @@
+import { Alert, Button, Card, Select, Space, Table, Tag, Typography } from 'antd'
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
+import { useNavigate, useSearchParams } from 'react-router-dom'
+import type { ApiError } from '../lib/api'
+import { getErrorSummary, type ErrorSummaryResponse } from '../lib/errors'
+import type { TaskRecord } from '../lib/tasks'
+
+export default function ErrorsPage() {
+  const navigate = useNavigate()
+  const [searchParams, setSearchParams] = useSearchParams()
+  const initializedRef = useRef(false)
+
+  const [loading, setLoading] = useState(false)
+  const [error, setError] = useState<ApiError | null>(null)
+  const [summary, setSummary] = useState<ErrorSummaryResponse | null>(null)
+
+  const page = useMemo(() => {
+    const raw = Number(searchParams.get('page') || '1')
+    return Number.isFinite(raw) && raw > 0 ? raw : 1
+  }, [searchParams])
+
+  const pageSize = useMemo(() => {
+    const raw = Number(searchParams.get('page_size') || '20')
+    return Number.isFinite(raw) && raw > 0 ? Math.min(raw, 200) : 20
+  }, [searchParams])
+
+  const scanLimit = useMemo(() => {
+    const raw = Number(searchParams.get('scan_limit') || '1000')
+    return Number.isFinite(raw) && raw > 0 ? Math.min(Math.floor(raw), 20000) : 1000
+  }, [searchParams])
+
+  const filters = useMemo(() => {
+    function splitListParam(name: string) {
+      const raw = String(searchParams.get(name) || '').trim()
+      if (!raw) return []
+      return raw
+        .split(',')
+        .map((v) => v.trim())
+        .filter((v) => v !== '')
+    }
+
+    return {
+      status: splitListParam('status'),
+      error_kind: splitListParam('error_kind'),
+    }
+  }, [searchParams])
+
+  const setParam = useCallback(
+    (patch: Record<string, string | undefined>) => {
+      const next = new URLSearchParams(searchParams)
+      Object.entries(patch).forEach(([k, v]) => {
+        const val = String(v || '').trim()
+        if (val) next.set(k, val)
+        else next.delete(k)
+      })
+      setSearchParams(next)
+    },
+    [searchParams, setSearchParams],
+  )
+
+  useEffect(() => {
+    if (initializedRef.current) return
+    initializedRef.current = true
+    const patch: Record<string, string | undefined> = {}
+    if (!searchParams.get('status')) patch.status = 'failed,rpa_failed'
+    if (!searchParams.get('scan_limit')) patch.scan_limit = '1000'
+    if (Object.keys(patch).length) setParam(patch)
+  }, [searchParams, setParam])
+
+  const load = useCallback(async () => {
+    setLoading(true)
+    setError(null)
+    try {
+      const res = await getErrorSummary({
+        scan_limit: scanLimit,
+        limit: pageSize,
+        offset: (page - 1) * pageSize,
+        status: filters.status,
+        error_kind: filters.error_kind,
+      })
+      setSummary(res)
+    } catch (e) {
+      setError(e as ApiError)
+    } finally {
+      setLoading(false)
+    }
+  }, [filters.error_kind, filters.status, page, pageSize, scanLimit])
+
+  useEffect(() => {
+    void load()
+  }, [load])
+
+  const kindEntries = useMemo(() => {
+    const counts = summary?.error_kind_counts || {}
+    return Object.entries(counts).sort((a, b) => b[1] - a[1])
+  }, [summary])
+
+  function kindColor(kind: string) {
+    const k = String(kind || '').toLowerCase()
+    if (k === 'auth') return 'red'
+    if (k === 'risk') return 'volcano'
+    if (k === 'rate') return 'gold'
+    if (k === 'timeout') return 'orange'
+    if (k === 'parse') return 'purple'
+    if (k === 'missing_dependency') return 'geekblue'
+    return 'blue'
+  }
+
+  function toggleErrorKind(kind: string) {
+    const k = String(kind || '').trim()
+    if (!k) return
+    const set = new Set(filters.error_kind)
+    if (set.has(k)) set.delete(k)
+    else set.add(k)
+    setParam({ error_kind: Array.from(set).join(',') || undefined, page: '1' })
+  }
+
+  function formatTs(value: number | null | undefined) {
+    if (!value) return '-'
+    const d = new Date(value * 1000)
+    return Number.isFinite(d.getTime()) ? d.toLocaleString() : '-'
+  }
+
+  const columns = useMemo(
+    () => [
+      {
+        title: 'ID',
+        dataIndex: 'id',
+        render: (id: string) => (
+          <Button type="link" style={{ paddingInline: 0 }} onClick={() => navigate(`/tasks/${encodeURIComponent(id)}`)}>
+            {id}
+          </Button>
+        ),
+      },
+      {
+        title: '状态',
+        dataIndex: 'status',
+        render: (status: string) => {
+          const color =
+            status === 'succeeded'
+              ? 'green'
+              : status === 'failed' || status === 'rpa_failed'
+                ? 'red'
+                : status === 'waiting_rpa' || status === 'risk_paused'
+                  ? 'gold'
+                  : 'blue'
+          return <Tag color={color}>{status}</Tag>
+        },
+      },
+      { title: '类型', dataIndex: 'task_type' },
+      { title: '目标', dataIndex: 'target' },
+      { title: '引擎', dataIndex: 'engine', render: (v: unknown) => (v ? String(v) : '-') },
+      { title: '重试', dataIndex: 'retry_count' },
+      {
+        title: '错误类型',
+        render: (_: unknown, record: TaskRecord) => record.error?.kind || '-',
+      },
+      {
+        title: '错误信息',
+        render: (_: unknown, record: TaskRecord) =>
+          typeof record.error?.message === 'string' && record.error.message.trim() ? record.error.message : '-',
+      },
+      { title: '创建时间', dataIndex: 'created', render: (v: number) => formatTs(v) },
+      { title: '完成时间', dataIndex: 'finished', render: (v: number | null | undefined) => formatTs(v) },
+    ],
+    [navigate],
+  )
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Space align="baseline">
+        <Typography.Title level={3} style={{ margin: 0 }}>
+          错误中心
+        </Typography.Title>
+        {summary ? (
+          <Typography.Text type="secondary">
+            scanned {summary.scanned} / {summary.scan_limit}
+          </Typography.Text>
+        ) : null}
+      </Space>
+
+      {error ? (
+        <Alert
+          type="error"
+          showIcon
+          message={error.status ? `HTTP ${error.status}` : '请求失败'}
+          description={error.message}
+        />
+      ) : null}
+
+      <Card
+        title="error_kind 分布"
+        extra={
+          <Space wrap>
+            <Select
+              value={scanLimit}
+              style={{ width: 160 }}
+              options={[
+                { value: 200, label: 'scan 200' },
+                { value: 500, label: 'scan 500' },
+                { value: 1000, label: 'scan 1000' },
+                { value: 5000, label: 'scan 5000' },
+                { value: 20000, label: 'scan 20000' },
+              ]}
+              onChange={(v) => setParam({ scan_limit: String(v), page: '1' })}
+            />
+            <Button onClick={load} disabled={loading}>
+              刷新
+            </Button>
+          </Space>
+        }
+        loading={loading}
+      >
+        {kindEntries.length ? (
+          <Space wrap size={[8, 8]}>
+            {kindEntries.map(([kind, count]) => (
+              <Tag
+                key={kind}
+                color={filters.error_kind.includes(kind) ? kindColor(kind) : undefined}
+                style={{ cursor: 'pointer', userSelect: 'none' }}
+                onClick={() => toggleErrorKind(kind)}
+              >
+                {kind}: {count}
+              </Tag>
+            ))}
+          </Space>
+        ) : (
+          <Typography.Text type="secondary">暂无</Typography.Text>
+        )}
+      </Card>
+
+      <Card
+        title="失败任务列表"
+        extra={
+          <Space wrap>
+            <Select
+              mode="tags"
+              allowClear
+              placeholder="status"
+              style={{ width: 220 }}
+              value={filters.status}
+              onChange={(v) => setParam({ status: v.length ? v.join(',') : undefined, page: '1' })}
+            />
+            <Select
+              mode="tags"
+              allowClear
+              placeholder="error_kind"
+              style={{ width: 220 }}
+              value={filters.error_kind}
+              onChange={(v) => setParam({ error_kind: v.length ? v.join(',') : undefined, page: '1' })}
+            />
+            <Button onClick={load} disabled={loading}>
+              刷新
+            </Button>
+          </Space>
+        }
+        loading={loading}
+      >
+        <Table
+          size="middle"
+          rowKey="id"
+          columns={columns}
+          dataSource={summary?.tasks || []}
+          pagination={{
+            current: page,
+            pageSize,
+            total: summary?.total || 0,
+            showSizeChanger: true,
+            pageSizeOptions: [10, 20, 50, 100, 200],
+            onChange: (nextPage, nextSize) => setParam({ page: String(nextPage), page_size: String(nextSize) }),
+          }}
+        />
+      </Card>
+    </div>
+  )
+}
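Every filter on this page lives in the URL, so a filtered view is shareable and survives reloads: status and error_kind are comma-separated lists, and the first-mount effect seeds status=failed,rpa_failed plus scan_limit=1000 when absent. Toggling the auth and timeout tags, for instance, produces a query string like

/errors?status=failed,rpa_failed&scan_limit=1000&error_kind=auth,timeout&page=1

which splitListParam parses back into ['auth', 'timeout'] on the next render.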
frontend/src/pages/LeadsPage.tsx
ADDED
@@ -0,0 +1,78 @@
+import { Card, Table, Typography, Space, Tag, Button } from 'antd'
+import { useCallback, useEffect, useState } from 'react'
+import { listLeads, type LeadRecord } from '../lib/business'
+
+export default function LeadsPage() {
+  const [loading, setLoading] = useState(false)
+  const [data, setData] = useState<LeadRecord[]>([])
+  const [total, setTotal] = useState(0)
+  const [page, setPage] = useState(1)
+  const [pageSize, setPageSize] = useState(20)
+
+  const refresh = useCallback(async (current: number, size: number) => {
+    setLoading(true)
+    try {
+      const res = await listLeads({ limit: size, offset: (current - 1) * size })
+      setData(res.leads)
+      setTotal(res.total)
+    } catch (e) {
+      console.error(e)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    refresh(page, pageSize)
+  }, [page, pageSize, refresh])
+
+  const columns = [
+    { title: 'ID', dataIndex: 'id', width: 80 },
+    {
+      title: '联系信息 (Contact Info)',
+      dataIndex: 'contact_info',
+      ellipsis: true,
+      render: (v: string) => <Typography.Text ellipsis>{v || '—'}</Typography.Text>,
+    },
+    {
+      title: '状态',
+      dataIndex: 'status',
+      width: 100,
+      render: (v: string) => <Tag color={v === 'new' ? 'blue' : 'green'}>{v || '—'}</Tag>,
+    },
+    { title: '创建时间', dataIndex: 'created_at', width: 180 },
+    { title: '上次同步', dataIndex: 'last_sync_at', width: 180 },
+  ]
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Space align="center" style={{ justifyContent: 'space-between' }}>
+        <Typography.Title level={3} style={{ margin: 0 }}>
+          线索转化池
+        </Typography.Title>
+        <Button onClick={() => refresh(page, pageSize)} loading={loading}>
+          刷新
+        </Button>
+      </Space>
+
+      <Card>
+        <Table<LeadRecord>
+          rowKey="id"
+          size="middle"
+          columns={columns}
+          dataSource={data}
+          loading={loading}
+          pagination={{
+            current: page,
+            pageSize,
+            total,
+            onChange: (p, s) => {
+              setPage(p)
+              setPageSize(s)
+            },
+          }}
+        />
+      </Card>
+    </div>
+  )
+}
frontend/src/pages/MetricsPage.tsx
ADDED
@@ -0,0 +1,81 @@
+import { Button, Card, Input, Space, Typography } from 'antd'
+import { useCallback, useEffect, useMemo, useState } from 'react'
+import type { ApiError } from '../lib/api'
+import { apiGetText } from '../lib/api'
+
+export default function MetricsPage() {
+  const [loading, setLoading] = useState(false)
+  const [data, setData] = useState<string>('')
+  const [error, setError] = useState<ApiError | null>(null)
+  const [keyword, setKeyword] = useState('')
+
+  const refresh = useCallback(async () => {
+    setLoading(true)
+    setError(null)
+    try {
+      const res = await apiGetText('metrics')
+      setData(res)
+    } catch (e) {
+      setError(e as ApiError)
+    } finally {
+      setLoading(false)
+    }
+  }, [])
+
+  useEffect(() => {
+    refresh()
+  }, [refresh])
+
+  const filtered = useMemo(() => {
+    const raw = data || ''
+    const kw = keyword.trim()
+    if (!kw) return { text: raw, matchedLines: undefined as number | undefined, totalLines: undefined as number | undefined }
+    const lines = raw.split('\n')
+    const lowered = kw.toLowerCase()
+    const kept = lines.filter((line) => line.toLowerCase().includes(lowered))
+    return { text: kept.join('\n'), matchedLines: kept.length, totalLines: lines.length }
+  }, [data, keyword])
+
+  return (
+    <div style={{ display: 'grid', gap: 16 }}>
+      <Typography.Title level={3} style={{ margin: 0 }}>
+        Metrics
+      </Typography.Title>
+
+      <Card
+        title="/api/v1/metrics"
+        loading={loading}
+        extra={
+          <Space>
+            <Input
+              value={keyword}
+              allowClear
+              placeholder="关键字过滤(前端过滤)"
+              style={{ width: 240 }}
+              onChange={(e) => setKeyword(e.target.value)}
+            />
+            <Button onClick={refresh} disabled={loading}>
+              刷新
+            </Button>
+          </Space>
+        }
+      >
+        {error ? (
+          <Typography.Text type="danger">
+            {error.status ? `HTTP ${error.status}: ` : ''}
+            {error.message}
+          </Typography.Text>
+        ) : (
+          <div style={{ display: 'grid', gap: 8 }}>
+            {filtered.matchedLines !== undefined ? (
+              <Typography.Text type="secondary">
+                匹配行数:{filtered.matchedLines}/{filtered.totalLines}
+              </Typography.Text>
+            ) : null}
+            <pre style={{ margin: 0, whiteSpace: 'pre-wrap' }}>{filtered.text}</pre>
+          </div>
+        )}
+      </Card>
+    </div>
+  )
+}
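The keyword box never re-queries the server: it filters the already-fetched metrics text line by line, case-insensitively, and reports the matched/total line counts. With an invented two-line exposition such as

spider_tasks_total{status="succeeded"} 42
spider_queue_length 3

typing queue keeps only the second line and the counter reads 1/2. (Metric names are illustrative; the real /api/v1/metrics output is not shown in this view.)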
frontend/src/pages/RawNotesPage.tsx
ADDED
@@ -0,0 +1,187 @@
import { Alert, Button, Card, Empty, Input, Space, Table, Typography } from 'antd'
import { useCallback, useEffect, useMemo, useState } from 'react'
import { useSearchParams } from 'react-router-dom'
import type { ApiError } from '../lib/api'
import { isOrchestratorDbUnavailable, listRawNotes, type RawNoteRecord } from '../lib/content'

export default function RawNotesPage() {
  const [searchParams, setSearchParams] = useSearchParams()
  const [loading, setLoading] = useState(false)
  const [error, setError] = useState<ApiError | null>(null)
  const [notes, setNotes] = useState<RawNoteRecord[]>([])
  const [total, setTotal] = useState(0)

  // page / page_size / query live in the URL, so list state survives reloads and can be shared.
  const page = useMemo(() => {
    const raw = Number(searchParams.get('page') || '1')
    return Number.isFinite(raw) && raw > 0 ? raw : 1
  }, [searchParams])

  const pageSize = useMemo(() => {
    const raw = Number(searchParams.get('page_size') || '20')
    return Number.isFinite(raw) && raw > 0 ? Math.min(raw, 200) : 20
  }, [searchParams])

  const query = useMemo(() => String(searchParams.get('query') || '').trim(), [searchParams])
  const [queryInput, setQueryInput] = useState(query)

  useEffect(() => {
    setQueryInput(query)
  }, [query])

  // Merge a patch of query params; empty values remove the key.
  const setParam = useCallback(
    (patch: Record<string, string | undefined>) => {
      const next = new URLSearchParams(searchParams)
      Object.entries(patch).forEach(([k, v]) => {
        const val = String(v || '').trim()
        if (val) next.set(k, val)
        else next.delete(k)
      })
      setSearchParams(next)
    },
    [searchParams, setSearchParams],
  )

  const loadList = useCallback(async () => {
    setLoading(true)
    setError(null)
    try {
      const res = await listRawNotes({
        limit: pageSize,
        offset: (page - 1) * pageSize,
        query: query || undefined,
      })
      setNotes(res.notes || [])
      setTotal(res.total || 0)
    } catch (e) {
      setError(e as ApiError)
    } finally {
      setLoading(false)
    }
  }, [page, pageSize, query])

  useEffect(() => {
    void loadList()
  }, [loadList])

  const columns = useMemo(
    () => [
      { title: 'ID', dataIndex: 'id', width: 90 },
      {
        title: 'Author',
        dataIndex: 'author',
        width: 160,
        render: (v: unknown) => (v ? String(v) : '—'),
      },
      {
        title: 'URL',
        dataIndex: 'url',
        width: 340,
        ellipsis: true,
        render: (v: unknown) => {
          const url = String(v || '').trim()
          if (!url) return '—'
          return (
            <a href={url} target="_blank" rel="noreferrer">
              {url}
            </a>
          )
        },
      },
      {
        title: 'Content',
        dataIndex: 'content',
        ellipsis: true,
        render: (v: unknown) => {
          const text = String(v || '').trim()
          return text ? <Typography.Text>{text}</Typography.Text> : '—'
        },
      },
      {
        title: 'Source',
        dataIndex: 'source_platform',
        width: 120,
        render: (v: unknown) => (v ? String(v) : '—'),
      },
      {
        title: 'Created At',
        dataIndex: 'created_at',
        width: 200,
        render: (v: unknown) => (v ? String(v) : '—'),
      },
    ],
    [],
  )

  const dbMissing = isOrchestratorDbUnavailable(error)

  return (
    <div style={{ display: 'grid', gap: 16 }}>
      <Typography.Title level={3} style={{ margin: 0 }}>
        Raw Notes Library
      </Typography.Title>

      <Card
        title="GET /api/v1/content/raw-notes"
        extra={
          <Space wrap>
            <Input.Search
              allowClear
              placeholder="Search author / url / content"
              value={queryInput}
              onChange={(e) => setQueryInput(e.target.value)}
              onSearch={(value) => setParam({ query: value.trim() || undefined, page: '1' })}
              style={{ width: 320 }}
            />
            <Button onClick={loadList} disabled={loading}>
              Refresh
            </Button>
          </Space>
        }
        loading={loading}
      >
        {dbMissing ? (
          <Empty
            image={Empty.PRESENTED_IMAGE_SIMPLE}
            description={
              <div style={{ maxWidth: 680 }}>
                <Typography.Text strong>Content database unavailable</Typography.Text>
                <Typography.Paragraph style={{ marginTop: 8, marginBottom: 0 }}>
                  The server returned 503, which usually means ORCHESTRATOR_DB_PATH is not configured, the path does not exist, or the DB has not been initialized.
                  <br />
                  To fix:
                  <br />
                  1) Set ORCHESTRATOR_DB_PATH to a readable sqlite file (default orchestrator/data/mvp.db)
                  <br />
                  2) Initialize the DB: python Spider_XHS/orchestrator/db_init.py
                </Typography.Paragraph>
              </div>
            }
          />
        ) : error ? (
          <Alert
            type="error"
            showIcon
            message={error.status ? `HTTP ${error.status}` : 'Request failed'}
            description={error.message}
            style={{ marginBottom: 16 }}
          />
        ) : null}

        <Table
          size="middle"
          rowKey="id"
          columns={columns}
          dataSource={notes}
          pagination={{
            current: page,
            pageSize,
            total,
            showSizeChanger: true,
            pageSizeOptions: [10, 20, 50, 100, 200],
            onChange: (nextPage, nextSize) => setParam({ page: String(nextPage), page_size: String(nextSize) }),
          }}
        />
      </Card>
    </div>
  )
}
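
This page only pins down a contract for `listRawNotes` in `../lib/content`: it takes `limit`, `offset`, and an optional `query`, and resolves to `{ notes, total }`. The real module ships elsewhere in this commit and is not shown here, so the sketch below is a hedged reconstruction of that contract; the endpoint path is inferred from the card title, and the query-parameter names are assumptions:

// content.ts — minimal sketch of the client this page imports. Only the
// fields the page actually reads (notes, total, id, author, url, content,
// source_platform, created_at) are grounded; everything else is assumed.
export interface RawNoteRecord {
  id: number
  author?: string
  url?: string
  content?: string
  source_platform?: string
  created_at?: string
}

export interface RawNotesListResponse {
  notes: RawNoteRecord[]
  total: number
}

export async function listRawNotes(params: {
  limit: number
  offset: number
  query?: string
}): Promise<RawNotesListResponse> {
  const qs = new URLSearchParams({ limit: String(params.limit), offset: String(params.offset) })
  if (params.query) qs.set('query', params.query)
  const res = await fetch(`/api/v1/content/raw-notes?${qs.toString()}`)
  // Throw an object matching the ApiError shape the page consumes ({ status, message }).
  if (!res.ok) throw { status: res.status, message: await res.text() }
  return (await res.json()) as RawNotesListResponse
}
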
frontend/src/pages/ResourcesAccountsPage.tsx
ADDED
@@ -0,0 +1,178 @@
import { Alert, Button, Card, Space, Table, Tag, Tooltip, Typography, Popconfirm, message } from 'antd'
import { useCallback, useEffect, useMemo, useState } from 'react'
import type { ApiError } from '../lib/api'
import { getResourceAccounts, cooldownAccount, disableAccount, type AccountPoolSnapshotResponse, type AccountSnapshotItem } from '../lib/resources'

export default function ResourcesAccountsPage() {
  const [loading, setLoading] = useState(false)
  const [error, setError] = useState<ApiError | null>(null)
  const [data, setData] = useState<AccountPoolSnapshotResponse | null>(null)

  const refresh = useCallback(async () => {
    setLoading(true)
    setError(null)
    try {
      const res = await getResourceAccounts()
      setData(res)
    } catch (e) {
      setError(e as ApiError)
    } finally {
      setLoading(false)
    }
  }, [])

  useEffect(() => {
    refresh()
  }, [refresh])

  const handleCooldown = async (accountId: string) => {
    try {
      await cooldownAccount(accountId, 900) // Default 15 minutes
      message.success(`Account ${accountId} manually cooled down for 15 minutes`)
      refresh()
    } catch (e) {
      message.error(`Cooldown failed: ${(e as ApiError).message}`)
    }
  }

  const handleDisable = async (accountId: string) => {
    try {
      await disableAccount(accountId)
      message.success(`Account ${accountId} manually disabled`)
      refresh()
    } catch (e) {
      message.error(`Disable failed: ${(e as ApiError).message}`)
    }
  }

  const columns = useMemo(
    () => [
      { title: 'id', dataIndex: 'id', width: 220, ellipsis: true },
      {
        title: 'tags',
        dataIndex: 'tags',
        render: (tags: string[]) => (
          <Space wrap>
            {(tags || []).length ? (tags || []).map((t) => <Tag key={t}>{t}</Tag>) : <Typography.Text>—</Typography.Text>}
          </Space>
        ),
      },
      {
        title: 'risk_score',
        dataIndex: 'risk_score',
        width: 120,
        render: (v: number) => <Typography.Text>{typeof v === 'number' && Number.isFinite(v) ? v : '—'}</Typography.Text>,
      },
      {
        title: 'enabled',
        dataIndex: 'enabled',
        width: 100,
        render: (v: boolean) => <Tag color={v ? 'green' : 'red'}>{v ? 'true' : 'false'}</Tag>,
      },
      {
        title: 'available',
        dataIndex: 'available',
        width: 110,
        render: (v: boolean) => <Tag color={v ? 'green' : 'red'}>{v ? 'true' : 'false'}</Tag>,
      },
      {
        title: 'cooldown_remaining_s',
        dataIndex: 'cooldown_remaining_s',
        width: 170,
        render: (v: number) =>
          typeof v === 'number' && Number.isFinite(v) ? (
            <Typography.Text>{Math.max(0, Math.round(v * 100) / 100)}</Typography.Text>
          ) : (
            '—'
          ),
      },
      { title: 'cooldown_until', dataIndex: 'cooldown_until', width: 190, render: (v: string | null) => v || '—' },
      { title: 'last_error_kind', dataIndex: 'last_error_kind', width: 150, render: (v: string | null) => v || '—' },
      { title: 'last_error_at', dataIndex: 'last_error_at', width: 190, render: (v: string | null) => v || '—' },
      {
        title: 'Actions',
        key: 'action',
        fixed: 'right',
        width: 150,
        render: (_: any, record: AccountSnapshotItem) => (
          <Space size="small">
            <Popconfirm title="Manually cool down this account for 15 minutes?" onConfirm={() => handleCooldown(record.id)}>
              <Button type="link" size="small">Cooldown</Button>
            </Popconfirm>
            <Popconfirm title="Permanently disable this account?" onConfirm={() => handleDisable(record.id)}>
              <Button type="link" size="small" danger>Disable</Button>
            </Popconfirm>
          </Space>
        )
      }
    ],
    [],
  )

  return (
    <div style={{ display: 'grid', gap: 16 }}>
      <Space align="center" style={{ justifyContent: 'space-between' }}>
        <Space wrap align="center">
          <Typography.Title level={3} style={{ margin: 0 }}>
            Account Pool
          </Typography.Title>
          <Tag color="blue">Read-only</Tag>
          <Tag color="gold">Planned</Tag>
        </Space>
        <Space wrap>
          <Tooltip title="Planned: add/edit/disable/cooldown write operations are not yet supported here">
            <span>
              <Button disabled>Add Account</Button>
            </span>
          </Tooltip>
          <Tooltip title="Planned: write operations not yet supported">
            <span>
              <Button disabled>Edit</Button>
            </span>
          </Tooltip>
          <Tooltip title="Planned: write operations not yet supported">
            <span>
              <Button disabled>Disable</Button>
            </span>
          </Tooltip>
          <Tooltip title="Planned: write operations not yet supported">
            <span>
              <Button disabled>Cooldown</Button>
            </span>
          </Tooltip>
          <Button onClick={refresh} loading={loading}>
            Refresh
          </Button>
        </Space>
      </Space>

      <Card title="GET /api/v1/resources/accounts" loading={loading}>
        {error ? (
          <Alert
            type="error"
            showIcon
            message="Request failed"
            description={`${error.status ? `HTTP ${error.status}: ` : ''}${error.message}`}
            style={{ marginBottom: 16 }}
          />
        ) : null}

        <Space wrap style={{ marginBottom: 12 }}>
          <Typography.Text type="secondary">now: {data?.now || '—'}</Typography.Text>
          <Typography.Text type="secondary">seed: {typeof data?.seed === 'number' ? data.seed : '—'}</Typography.Text>
          <Typography.Text type="secondary">accounts: {data?.accounts?.length ?? 0}</Typography.Text>
        </Space>

        <Table<AccountSnapshotItem>
          rowKey="id"
          size="middle"
          columns={columns as any}
          dataSource={data?.accounts || []}
          pagination={{ pageSize: 20, showSizeChanger: true, pageSizeOptions: [10, 20, 50, 100] }}
          scroll={{ x: 1400 }}
        />
      </Card>
    </div>
  )
}
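
The row actions above call two write helpers from `../lib/resources`, `cooldownAccount(id, seconds)` and `disableAccount(id)`. That module is not shown in this section, so the sketch below is only a hedged guess at its shape: the call signatures come from the page, while the endpoint paths and JSON payload are assumptions patterned after the read route `GET /api/v1/resources/accounts`:

// resources.ts — hedged sketch of the two write helpers the page calls.
// Only the signatures are grounded in the code above; the URLs and body
// format are hypothetical.
export async function cooldownAccount(accountId: string, seconds: number): Promise<void> {
  const res = await fetch(`/api/v1/resources/accounts/${encodeURIComponent(accountId)}/cooldown`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ seconds }),
  })
  // Surface failures in the ApiError shape ({ status, message }) the handlers expect.
  if (!res.ok) throw { status: res.status, message: await res.text() }
}

export async function disableAccount(accountId: string): Promise<void> {
  const res = await fetch(`/api/v1/resources/accounts/${encodeURIComponent(accountId)}/disable`, {
    method: 'POST',
  })
  if (!res.ok) throw { status: res.status, message: await res.text() }
}
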