Commit · 5ab87e00
Parent(s): init

Files changed:

- .gitattributes +36 -0
- .gitignore +5 -0
- README.md +12 -0
- app.py +0 -0
- eval_benchmarks.py +468 -0
- re_call/__init__.py +8 -0
- re_call/prompts.py +184 -0
- re_call/re_call.py +1490 -0
- requirements.txt +16 -0
- run_question.py +275 -0
- tokenizer-info/added_tokens.json +3 -0
- tokenizer-info/merges.txt +0 -0
- tokenizer-info/special_tokens_map.json +3 -0
- tokenizer-info/tokenizer.json +3 -0
- tokenizer-info/tokenizer_config.json +3 -0
- tokenizer-info/vocab.json +3 -0
- web_agents_5/compressor.py +314 -0
- web_agents_5/config.py +38 -0
- web_agents_5/fetchers/__init__.py +0 -0
- web_agents_5/fetchers/basic_fetcher.py +42 -0
- web_agents_5/fetchers/crawl4ai_fetcher.py +104 -0
- web_agents_5/fetchers/github_fetcher.py +64 -0
- web_agents_5/fetchers/jina_fetcher.py +151 -0
- web_agents_5/fetchers/pdf_fetcher.py +53 -0
- web_agents_5/fetchers/reddit_fetcher.py +324 -0
- web_agents_5/fetchers/youtube_fetcher.py +50 -0
- web_agents_5/fetchers_async.py +155 -0
- web_agents_5/host_serper2.sh +6 -0
- web_agents_5/sandbox_serper.py +90 -0
- web_agents_5/search_api.py +63 -0
- web_agents_5/utils.py +145 -0
- web_agents_5/web_helpers.py +64 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
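Note the final rule: `*.json` is routed through LFS, which is why each `tokenizer-info/*.json` entry in the file list above shows exactly +3 lines (the three-line LFS pointer stub: version, oid, size) rather than its real contents. A minimal sketch for checking which filter applies to a path in a local clone (assumes `git` is on PATH; the path below is illustrative):

```python
import subprocess

def lfs_filter_for(path: str) -> str:
    """Ask git which `filter` attribute applies to `path` (e.g. 'lfs')."""
    out = subprocess.run(
        ["git", "check-attr", "filter", "--", path],
        capture_output=True, text=True, check=True,
    ).stdout
    # Output format: "<path>: filter: <value>"
    return out.strip().rsplit(": ", 1)[-1]

print(lfs_filter_for("tokenizer-info/tokenizer.json"))  # expected: "lfs"
```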
.gitignore ADDED
@@ -0,0 +1,5 @@
+__pycache__/
+.gradio/
+temp/*
+.cache/*
+nohup.out
README.md ADDED
@@ -0,0 +1,12 @@
+---
+title: Fathom DeepResearch
+emoji: π
+colorFrom: green
+colorTo: red
+sdk: gradio
+sdk_version: 5.44.1
+app_file: app.py
+pinned: false
+license: mit
+short_description: Use the fathom search 4b model interactively
+---
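This block is Hugging Face Spaces front matter: it tells the Hub to serve `app.py` with the Gradio SDK pinned at 5.44.1. A minimal sketch of reading these fields programmatically (assumes PyYAML is installed; this helper is not part of the commit):

```python
import yaml  # pip install pyyaml

def read_front_matter(readme_path: str = "README.md") -> dict:
    """Parse the YAML block between the first pair of '---' delimiters."""
    text = open(readme_path, encoding="utf-8").read()
    _, block, _ = text.split("---", 2)  # front matter sits between the first two '---'
    return yaml.safe_load(block)

cfg = read_front_matter()
print(cfg["sdk"], cfg["sdk_version"], cfg["app_file"])  # -> gradio 5.44.1 app.py
```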
app.py ADDED

The diff for this file is too large to render. See raw diff.
eval_benchmarks.py ADDED
@@ -0,0 +1,468 @@
+# eval_benchmark_multithreaded.py
+"""Unified benchmarking script for ReCall, ZeroSearch, and R1-Searcher
+with optional multi-threaded execution.
+
+Example usage (single-threaded)
+-------------------------------
+```bash
+python eval_benchmarks.py \
+    --dataset frames \
+    --agent r1-searcher \
+    --model-url http://0.0.0.0:1233 \
+    --out /tmp/evals \
+    --mode single
+```
+
+Example usage (multi-threaded, 128 workers)
+-------------------------------------------
+```bash
+python eval_benchmarks.py \
+    --dataset frames \
+    --agent recall \
+    --model-url http://0.0.0.0:1231 \
+    --out /tmp/evals \
+    --mode multi \
+    --workers 128
+```
+The script will:
+1. Load the specified dataset JSONL file that contains objects with keys
+   `question` and `answer`.
+2. Build the chosen agent wrapper (`recall`, `zerosearch`, or `r1-searcher`).
+3. Stream one JSONL line per example with *all* details needed for analysis.
+4. Optionally run the evaluation loop in parallel using a configurable number
+   of worker threads.
+5. Automatically construct the output path as:
+   ```
+   {out}/{agent}/{dataset}-{name}.jsonl
+   ```
+   (see `build_output_path` below).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import pathlib
+import re
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List
+
+import unicodedata
+from openai import OpenAI, APIStatusError
+from tqdm import tqdm
+
+# --------------------------------------------------------------------
+# Agent imports (ensure PYTHONPATH is set appropriately)
+# --------------------------------------------------------------------
+from re_call import ReCall  # user's wrapper
+# from re_call import ZeroSearchInference, ZeroSearchConfig
+# from re_call import R1Searcher, R1SearchConfig as R1Cfg
+# from re_call import O1Cfg, O1Searcher
+from pathlib import Path
+# from re_call import SDSCfg, SDSSearcher
+
+# --------------------------------------------------------------------
+# Environment keys - override with real keys or environment variables
+# --------------------------------------------------------------------
+# for recall
+# search_env = "from search_api import web_search, web_visit"
+# search_schemas = [
+#     {
+#         "name": "web_search",
+#         "description": "Google search and return links to web-pages with a brief snippet given a text query",
+#         "parameters": {
+#             "type": "object",
+#             "properties": {
+#                 "query": {"type": "string"},
+#             },
+#             "required": ["query"],
+#         },
+#     },
+#     {
+#         "name": "web_visit",
+#         "description": "Visit webpage and return its content",
+#         "parameters": {
+#             "type": "object",
+#             "properties": {
+#                 "url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
+#             },
+#             "required": ["url"],
+#         },
+#     }
+# ]
+# for recall
+search_env = "from search_api import search_urls, open_url, search_and_parse_query, query_url"
+search_schemas = [
+    {
+        "name": "search_urls",
+        "description": "Google search and return links to web-pages with a brief snippet given a text query",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string"},
+                "top_k": {"type": "integer", "default": 10},
+            },
+            "required": ["query"],
+        },
+    },
+    {
+        "name": "query_url",
+        "description": "Visit webpage and return evidence-based retrieval for the provided goal",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
+                "goal": {"type": "string", "description": "The specific information goal for visiting webpage"},
+            },
+            "required": ["url", "goal"],
+        },
+    }
+]
+
+EXECUTOR_URL = os.environ["HOST_SERPER_URL"]
+DATA_ROOT = pathlib.Path("./eval_datasets")
+SEM = threading.Semaphore(3)  # limit concurrent judge calls
+JUDGE_MODEL = "gpt-4.1-mini"
+
+try:
+    base = Path(__file__).resolve().parent
+except NameError:  # e.g., REPL/Jupyter
+    base = Path.cwd()
+
+TOKENIZER_DIR = (base / "tokenizer-info").resolve()
+
+# ───────────────────────── tokenizer ─────────────────────────
+try:
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+except Exception as e:
+    import sys
+    sys.exit(f"❌ Could not load Qwen3 tokenizer: {e}")
+
+import hashlib
+
+def get_uid(sample: dict) -> str:
+    """Generate a UID using SHA256 hash of question."""
+    return hashlib.sha256(sample["question"].strip().encode("utf-8")).hexdigest()
+
+# --------------------------------------------------------------------
+# Regex & utilities
+# --------------------------------------------------------------------
+def extract_answer_tagged(text: str) -> str:
+    ANS_RE = re.compile(r"<answer>(.*?)</answer>", re.S)
+    match = ANS_RE.findall(text)
+    if match:
+        return match[-1].strip().lower()
+    else:
+        print("No answer tags found")
+        return text[-200:]  # because o1-searcher fails to follow format
+
+def extract_answer_boxed(response):
+    def remove_boxed(s):
+        if "\\boxed " in s:
+            left = "\\boxed "
+            assert s[:len(left)] == left
+            return s[len(left):]
+
+        left = "\\boxed{"
+
+        assert s[:len(left)] == left
+        assert s[-1] == "}"
+
+        return s[len(left):-1]
+
+    def last_boxed_only_string(string):
+        idx = string.rfind("\\boxed")
+        if "\\boxed " in string:
+            return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+        if idx < 0:
+            idx = string.rfind("\\fbox")
+            if idx < 0:
+                return None
+
+        i = idx
+        right_brace_idx = None
+        num_left_braces_open = 0
+        while i < len(string):
+            if string[i] == "{":
+                num_left_braces_open += 1
+            if string[i] == "}":
+                num_left_braces_open -= 1
+                if num_left_braces_open == 0:
+                    right_brace_idx = i
+                    break
+            i += 1
+
+        if right_brace_idx is None:
+            retval = None
+        else:
+            retval = string[idx:right_brace_idx + 1]
+
+        return retval
+
+    answer = remove_boxed(last_boxed_only_string(response))
+    return answer
+
+
+JUDGE_SYS = """
+You are an impartial judge evaluating the correctness of a model's answer against a ground-truth answer for a given question. Your task is to:
+1. Compare the model's answer to the ground-truth answer.
+2. Determine if the model's answer is correct or incorrect.
+
+**Input Format:**
+- Question: {question}
+- Ground Truth: {ground_truth}
+- Model Answer: {model_answer}
+
+**Output Format:**
+correct/incorrect/unknown
+
+**Guidelines:**
+- The model's answer is correct if it matches the ground-truth answer in meaning and content; matching is case-insensitive, and minor punctuation or formatting differences are ignored.
+- If the model's answer contains additional information, it is still correct as long as the core answer matches the ground truth.
+- Be precise: output a single word (correct / incorrect / unknown) and **nothing else**.
+- For MCQ questions, match the option ID (A., B., C., or D.); if the option matches, the answer is correct.
+"""
+# - If the model's answer is partially correct or contains errors, it is incorrect.
+
+
+# Thread-local OpenAI client cache
+
+def _oa() -> OpenAI:
+    th = threading.current_thread()
+    if not hasattr(th, "_oa"):
+        th._oa = OpenAI()
+    return th._oa
+
+
+def judge(q: str, gt: str, pred: str) -> str:
+    if pred == "":
+        return "unknown"
+    prompt = JUDGE_SYS.format(question=q, ground_truth=gt, model_answer=pred)
+    try:
+        with SEM:
+            resp = _oa().chat.completions.create(
+                model=JUDGE_MODEL,
+                messages=[
+                    {"role": "system", "content": JUDGE_SYS},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0.0,
+                max_tokens=100,
+            )
+        return resp.choices[0].message.content.strip().lower()
+    except APIStatusError:
+        return "unknown"
+
+
+# --------------------------------------------------------------------
+# Agent factory
+# --------------------------------------------------------------------
+def build_agent(kind: str, model_url: str):
+    kind = kind.lower()
+    print(kind)
+    if kind == "recall":
+        return ReCall(executor_url=EXECUTOR_URL)
+    else:
+        raise ValueError(f"Unknown agent kind: {kind}")
+    # if kind == "o1-search" or kind == "sds":
+    #     cfg = O1Cfg()
+    #     return O1Searcher(cfg, thinker_url=model_url)
+    # if kind == "zerosearch":
+    #     cfg = ZeroSearchConfig(thinker_url=model_url)
+    #     return ZeroSearchInference(cfg)
+    # if kind in ("r1-search", "r1-searcher", "r1"):
+    #     cfg = R1Cfg(serper_api_key=os.getenv("SERPER_API_KEY", ""))
+    #     return R1Searcher(cfg=cfg, model_url=model_url)
+    # raise ValueError(f"Unknown agent kind: {kind}")
+
+
+# --------------------------------------------------------------------
+# Core evaluation routine for a single example (thread-safe)
+# --------------------------------------------------------------------
+def evaluate_example(example: Dict[str, str], agent_kind: str, model_url: str) -> Dict[str, str]:
+    """Run one example through the pipeline and return result row."""
+    question = example["question"].strip()
+    answer_gt = example["answer"].strip()
+    idx = example["id"].strip()
+
+    # Build a *fresh* agent per thread to avoid shared-state issues
+    agent = build_agent(agent_kind, model_url=model_url)
+    chat = None  # only populated by the recall branch below
+
+    if agent_kind == "recall" and model_url == "deepseek-ai/DeepSeek-R1":
+        # print(agent_kind)
+        # print("B"*100)
+        transcript, tool_calls = agent.run_deepseek(
+            env=search_env,
+            func_schemas=search_schemas,
+            question=question,
+            model_name="deepseek-ai/DeepSeek-R1",
+            temperature=0.6,
+            max_tokens=40960,
+            # tokenizer=tokenizer
+        )
+    elif agent_kind == "recall":
+        transcript, tool_calls, chat = agent.run(
+            env=search_env,
+            func_schemas=search_schemas,
+            question=question,
+            model_url=model_url,
+            temperature=0.6,
+            max_new_tokens=40960,
+            tokenizer=tokenizer,
+        )
+        # tool_calls = agent.extract_tool_calls(transcript)
+    else:  # zerosearch or r1-searcher
+        transcript, tool_calls = agent.run(question)
+
+    if agent_kind in [
+        "r1-searcher",
+        "zerosearch",
+        # "o1-search",
+    ]:
+        pred = extract_answer_tagged(transcript)
+    elif agent_kind in [
+        "recall",
+        "SDS",
+        "o1-searcher",
+    ]:
+        try:
+            pred = extract_answer_boxed(transcript)
+        except Exception:
+            print("falling back to last string")
+            pred = transcript[-200:]
+    else:
+        try:
+            pred = extract_answer_boxed(transcript)
+        except Exception:
+            print("falling back to last string")
+            pred = transcript[-200:]
+
+    verdict = judge(question, answer_gt.lower(), pred.lower())
+
+    return {
+        "id": idx,
+        "question": question,
+        "answer_gt": answer_gt,
+        "model_answer": pred,
+        "judge": verdict,
+        "tool_calls": tool_calls,
+        "transcript": transcript,
+        "chat": chat,
+    }
+
+# --------------------------------------------------------------------
+# CLI entry-point
+# --------------------------------------------------------------------
+def build_output_path(out_base, agent, dataset, name) -> pathlib.Path:
+    """Construct output path as {out_base}/{agent}/{dataset}-{name}.jsonl."""
+    return out_base / f"{agent}" / f"{dataset}-{name}.jsonl"
+
+def normalize(s: str) -> str:
+    return unicodedata.normalize("NFKD", s.strip().lower())
+
+def load_existing_results(path: pathlib.Path) -> tuple[list[dict], set[str]]:
+    results = []
+    uids = set()
+    if not path.exists():
+        return results, uids
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f:
+            try:
+                row = json.loads(line)
+                if row['model_answer'] != "":
+                    results.append(row)
+                    uids.add(row["id"])
+            except Exception:
+                continue
+    return results, uids
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark QA agents on a dataset (single or multi-threaded)")
+    parser.add_argument("--dataset", required=True, help="dataset name (frames, ...)")
+    parser.add_argument("--agent", required=True, choices=["recall", "zerosearch", "r1-searcher", "o1-search", "SDS", "deepseek-r1"], help="agent wrapper")
+    parser.add_argument("--out", required=True, help="base directory for outputs")
+    parser.add_argument("--model-url", required=False, help="URL of the model server")
+    parser.add_argument("--limit", type=int, default=0, help="optional cap on number of questions")
+    parser.add_argument("--mode", choices=["single", "multi"], default="single", help="execution mode")
+    parser.add_argument("--workers", type=int, default=8, help="number of worker threads for multi mode")
+    parser.add_argument("--name", type=str, default="", help="suffix for save dir")
+
+    args = parser.parse_args()
+
+    # ----------------------------------------------------------------
+    # Dataset loading
+    # ----------------------------------------------------------------
+    ds_path = DATA_ROOT / f"{args.dataset}.jsonl"
+    if not ds_path.exists():
+        raise FileNotFoundError(ds_path)
+
+    with ds_path.open() as f:
+        data = [json.loads(line) for line in f]
+
+    # ----------------------------------------------------------------
+    # Output path setup
+    # ----------------------------------------------------------------
+    out_base = pathlib.Path(args.out).expanduser().resolve()
+    out_path = build_output_path(out_base, args.agent, args.dataset, args.name)
+    print(out_path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    if args.limit:
+        data = data[: args.limit]
+    # data = data[246:]
+
+    correct = 0
+    start_time = time.perf_counter()
+
+    # ----------------------------------------------------------------
+    # SINGLE-THREADED EXECUTION
+    # ----------------------------------------------------------------
+    if args.mode == "single":
+        with open(out_path, "w", encoding="utf-8") as fout:
+            for ex in tqdm(data, desc="QA loop (single)"):
+                row = evaluate_example(ex, args.agent, args.model_url)
+                if row["judge"] == "correct":
+                    correct += 1
+                # context for row
+                row.update({"agent": args.agent, "dataset": args.dataset})
+                fout.write(json.dumps(row, ensure_ascii=False) + "\n")
+                fout.flush()
+
+    # ----------------------------------------------------------------
+    # MULTI-THREADED EXECUTION
+    # ----------------------------------------------------------------
+    else:
+        workers = max(1, args.workers)
+        logging.info("Running in multi-threaded mode with %d workers", workers)
+        with ThreadPoolExecutor(max_workers=workers) as executor, open(out_path, "a", encoding="utf-8") as fout:
+            futures = {executor.submit(evaluate_example, ex, args.agent, args.model_url): ex for ex in data}
+            for fut in tqdm(as_completed(futures), total=len(futures), desc="QA loop (multi)"):
+                try:
+                    row = fut.result()
+                except Exception as exc:
+                    logging.exception("Evaluation failed: %s", exc)
+                    continue
+                # print(row['id'])
+                if row["judge"] == "correct":
+                    correct += 1
+                row.update({"agent": args.agent, "dataset": args.dataset})
+                fout.write(json.dumps(row, ensure_ascii=False) + "\n")
+                fout.flush()
+
+    elapsed = time.perf_counter() - start_time
+    accuracy = correct / len(data) if data else 0.0
+    print(f"Accuracy: {correct}/{len(data)} = {accuracy:.1%}")
+    print(f"Elapsed time: {elapsed:.2f}s ({elapsed/len(data):.2f}s per example)")
+
+
+if __name__ == "__main__":
+    main()
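To make the answer-extraction path above concrete, here is a small sanity check of `extract_answer_boxed` on a synthetic transcript. It assumes the function is already in scope (importing `eval_benchmarks` directly also runs its env-var and tokenizer setup at module load, so a plain import needs that environment); the transcript text is invented for illustration:

```python
# Invented model output ending in a boxed answer, as the recall agent produces.
sample_transcript = (
    "The GDPR applies from 25 May 2018 across all Member States.\n"
    "**Final:** \\boxed{May 25, 2018}"
)

# last_boxed_only_string() locates the final \boxed{...} span via brace
# matching, then remove_boxed() strips the wrapper, leaving the answer text.
print(extract_answer_boxed(sample_transcript))  # -> "May 25, 2018"
```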
re_call/__init__.py ADDED
@@ -0,0 +1,8 @@
+# from .inference.re_call import ReCall
+# from .inference.r1_searcher import R1Searcher, R1SearchConfig
+# from .inference.zerosearch import ZeroSearchInference, ZeroSearchConfig
+# from .inference.o1_searcher import O1Cfg, O1Searcher
+# from .inference.simpledeepsearch import SDSCfg, SDSearcher
+from .re_call import ReCall
+__all__ = ["ReCall"]
+# __all__ = ["ReCall", "R1Searcher", "ZeroSearchInference", "ZeroSearchConfig", "R1SearchConfig", "O1Cfg", "O1Searcher", "SDSCfg", "SDSearcher"]
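With this re-export in place, callers import the wrapper from the package root, exactly as `eval_benchmarks.py` does. A one-line usage sketch (the executor URL below is a placeholder, not a value from this commit):

```python
from re_call import ReCall  # resolved via re_call/__init__.py

agent = ReCall(executor_url="http://localhost:8000")  # illustrative sandbox URL
```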
re_call/prompts.py ADDED
@@ -0,0 +1,184 @@
+from typing import Final
+
+# DEEPRESEARCH_REPORT_SYS_PROMPT: Final[str] = r"""
+# You are a DeepResearch analyst and Report Converter. Turn a raw investigation trace into a clear,
+# decision-grade report suitable for executives.
+
+# INPUTS (provided in the user message)
+# - QUESTION: the research question.
+# - TRACE: the full transcript (may include assistant/user/tool snippets).
+# - TOOL_CALLS: raw list of tool calls (JSON-ish), which may contain URLs.
+
+# CRITICAL SOURCING CONSTRAINTS (non-negotiable)
+# - TRAJECTORY_LINKS = every URL you find in TRACE and TOOL_CALLS. Use ONLY these links. Do NOT add new sources.
+# - Evidence density: cite every non-obvious fact/date/figure/evaluative claim.
+# - Citation format: append raw bracketed URLs immediately after the supported sentence/point,
+#   e.g., "... announced in 2003. [https://example.com/page]".
+# - Prefer primary/official and the most recent authoritative updates. If sources conflict, explain briefly and cite both.
+
+# QUALITY & FRESHNESS
+# - Be neutral, precise, and reproducible. No fabrication.
+# - Distinguish **event date**, **publish/update date**, and **effective date** where relevant.
+# - If critical info is missing, state the gap and proceed with best-effort analysis grounded in available links.
+
+# OUTPUT RULES
+# - **Markdown only.** No system markers. No boxed answers (\boxed{}).
+# - Public-facing rationale only (no hidden chain-of-thought).
+# - Length proportional to complexity (short for simple, detailed for complex).
+# - **You decide the sectioning and narrative flow** based on the QUESTION and TRACE. Use headings only if they help clarity.
+# - Keep it decision-useful: tight claims tied to evidence, crisp takeaways, explicit uncertainties.
+
+# OPERATION
+# 1) Extract TRAJECTORY_LINKS from TRACE and TOOL_CALLS. These are your only allowable citations.
+# 2) Think privately about the best structure for this topic; then write the report accordingly.
+# 3) Map each included claim to at least one link; mark any necessary but unsupported claim as "unsupported".
+# 4) Normalize names/dates/figures; note gaps and conflicts, and how you resolved them.
+# 5) Conclude with a **deduplicated "Sources used" list** of the raw URLs you actually cited (one per line).
+
+# """
+
+DEEPRESEARCH_SYS_PROMPT: Final[str] = r"""
+You are a DeepResearch Assistant.
+
+Goal: (1) Produce a concise PLAN that breaks the QUESTION into sections and **maps every URL and tool_call content** in the trace to those sections; (2) Produce a public-facing REPORT that synthesizes **all** information from TRACE/TOOL_CALLS into an insightful report.
+
+========================
+INPUTS
+========================
+- QUESTION: research question.
+- TRACE: transcript (assistant/user/tool snippets).
+- TOOL_CALLS: raw tool calls (includes URLs and tool_responses).
+
+
+========================
+CITATIONS (ACCURACY-FIRST)
+========================
+- **TRAJECTORY_LINKS** = all URLs in TRACE/TOOL_CALLS. Cite **only** these; do not invent/browse.
+- Cite pivotal or non-obvious claims (dates, numbers, quotes, contested points).
+- **Density with accuracy:** Prefer **dense citations** on non-obvious/pivotal claims **only when confident** the link supports the exact statement; avoid stray/low-confidence citations.
+- **Sources used** = only URLs actually cited in REPORT.
+- Citation format: append raw square-bracketed full URLs immediately after the supported sentence/point, e.g., "... announced in 2003. [https://example.com/page]".
+
+
+========================
+PLAN (MANDATORY CONTENT)
+========================
+1) **Question → Sections** (derivation):
+   - Decompose QUESTION into sub-questions SQ1..SQn, then plan the structure of the report around that to cover all bases.
+   - Clearly outline the breakdown and structure of the report and the thought process behind it.
+
+2) **Evidence Map: Section → URL/tool_call mapping**
+   - **Harvest** all URLs from TRACE and TOOL_CALLS → this forms TRAJECTORY_LINKS.
+   - For **each Section (S1..Sn)**, list the **evidence items** (every TRAJECTORY_LINK and its content explored in the TRACE) relevant to it.
+   - **Coverage rule:** Ensure **most** URL/tool_call items from TRACE are mapped to at least one Section (unless truly irrelevant to the topic).
+   - Use this table (include all rows; add as many as needed):
+     | Section | Item | Content | Confidence |
+     |---|---|---|---|
+     | S1 | <URL_4> | date/stat/quote/context | High/Med/Low |
+     | S2 | <URL_1> <URL_2> | stat/definition/quote | High/Med/Low |
+   - If something is truly irrelevant, list it under **Omitted as Irrelevant (with reason)**; keep this list short and do not cite those items in the report.
+
+3) **Lay out the strategy for insight generation**:
+   - 4-6 bullets on how you will generate higher-level insight / analysis: e.g., contrast/benchmark, timeline, ratios/growth, causal chain, risks.
+   - You may generate insights / analysis by combining **general background knowledge** with TRACE facts, but only if the TRACE facts remain central.
+   - Beyond description, provide **analysis, interpretation, and recommendations** where possible.
+   - Recommendations must be **derived strictly from TRACE evidence**. No hallucinated numbers or unsupported claims.
+   - If evidence is insufficient for a clear recommendation, state this explicitly.
+
+========================
+REPORT (MANDATORY CONTENT)
+========================
+- # Executive Summary - 5-10 crisp bullets with concrete takeaways; cite pivotal/non-obvious claims.
+- ## Main Body - brief scope and inclusion rules; **provide higher-order insights built on the harvested evidence** (e.g., causal explanations, benchmarks, ratios/growth, timelines, scenarios/risks). Add a one-line deviation note if sections differ from PLAN.
+- ## S1..Sn (exactly as defined in PLAN) - each section answers its mapped sub-question and **integrates all mapped evidence**:
+  - Weave facts; where ≥3 related numbers exist, add a small Markdown table.
+  - **Integrate as much of the TRACE/TOOL_CALLS information as possible** in a structured way based on the question decomposition; if an item is only contextual, summarize briefly and attribute.
+  - Call out conflicts with both sources cited.
+- ## Recommendations - actionable, prioritized; must follow from cited evidence.
+- ## Conclusion - 3-6 sentences directly answering the QUESTION.
+- ## Sources used - deduplicated raw URLs, one per line (only those cited above).
+
+========================
+EXHAUSTIVENESS & COVERAGE
+========================
+- **Inclusion duty:** Factual detail explored in TRACE must appear in the final report unless completely irrelevant.
+- **Do not compress away specifics.** Prioritize: (1) exact figures/dates, (2) named entities/products, (3) risks/criticisms, (4) methods/assumptions, (5) contextual detail.
+- **Numeric presentation:** For ≥3 related numbers, render a small Markdown table with citations.
+- Be verbose in the Main Body; detailed explanations, exhaustive coverage, novel synthesis, insights, and dense citations are encouraged.
+
+========================
+QUALITY TARGETS (SCORING GUARDRAILS)
+========================
+- **Comprehensiveness (COMP):** Every URL/tool_response mapped in the plan is integrated. The REPORT should **strive to integrate maximum trace information** in context.
+- **Insight/Depth (DEPTH):** Use contrast/benchmarks, timelines, ratios/growth, causal links, scenarios, and risk framing to explain "why it matters," building insights **on top of the existing evidence** (no new facts).
+- **Instruction-Following (INST):** Sections mirror sub-questions; each SQ is explicitly answered; the report should be precise and not digress from what is asked in the question.
+- **Readability (READ):** Clear headings, short paragraphs, lead sentences with takeaways, tables for numeric clusters, and **dense-but-accurate** citations.
+
+========================
+STRICT OUTPUT FORMAT
+========================
+- You must give exactly one single output, with the private planning / thinking enclosed within <think></think> and the public-facing report following it:
+  <think>[Plan here]</think>[Report here]
+- The REPORT is strictly public-facing (no meta/process/thinking).
+- Markdown only. Public-facing rationale; no hidden notes or mention of the search trace or the thinking process in the report.
+- Target length for the Report section: **≥2000 words** (longer if complexity requires).
+"""
+
+# SUMMARY_SYS_PROMPT: Final[str] = r"""
+# You are a Summary Assistant.
+
+# Goal: Produce a public-facing response that structures all information from the input trace into a single answer.
+
+# ========================
+# INPUTS
+# ========================
+# - QUESTION: user's question.
+# - TRACE: transcript (assistant/user/tool snippets).
+# - TOOL_CALLS: raw tool calls (includes URLs and tool_responses).
+
+# ========================
+# RESPONSE (ANSWER) (MANDATORY CONTENT)
+# ========================
+# - The response to the user's question, enclosed in <answer></answer> tags.
+# - The response must be well-structured and detailed, covering all important steps, ideas, and any evidence/calculations found in the trace.
+# - If the task is CLOSED-ENDED (math/logic with a determinate result; factual single value/word; code producing a definite output), think and reason/plan internally and respond with the final part (explanation, method, proof, etc.) and present the result boxed with LaTeX: \boxed{...}.
+# - If the task is OPEN-ENDED (analysis, synthesis, design choices, multiple valid outcomes), think and reason/plan internally and respond with a detailed explanation of the search trace, sources, investigation, process/methodology, result/outcome/solution, conclusion, etc.; i.e. create a nicely structured and detailed answer for the question that can be shown to the user who asked it.
+# - Keep the answer detailed and well-structured, providing a thorough explanation/methodology/solution, whatever the user query calls for. Do not just give a one-line/very-short final response. The answer may be short if the question is trivial, but it must be well-structured and thorough.
+
+# ========================
+# STRICT OUTPUT FORMAT
+# ========================
+# - You must give exactly one single output, with the private planning / thinking enclosed within <think></think> and the public-facing answer following it:
+# <think>[Plan here]</think><answer>[Final Answer here]</answer>
+# - The final answer is strictly public-facing (no meta/process/thinking).
+# - Markdown only.
+# """
+
+SUMMARY_SYS_PROMPT: Final[str] = r"""
+You are an expert search-trace structurer. Given a QUESTION and the full search TRACE (may include tool-call notes),
+write a clear, accurate, self-contained explanation/solution using only the information in the trace. Do not add external facts.
+
+What to produce:
+- A single, readable, well-structured narrative / solution that covers all important steps, ideas, and any evidence/calculations found in the trace.
+- If the task is CLOSED-ENDED (math/logic with a determinate result; factual single value/word; code producing a definite output), think and reason/plan internally, respond with the final part (explanation, method, proof, etc.), and present the result boxed with LaTeX: \boxed{...}.
+- If the task is OPEN-ENDED (analysis, synthesis, design choices, multiple valid outcomes), think and reason/plan internally and respond with a detailed explanation of the search trace, sources, investigation, process/methodology, result/outcome/solution, conclusion, etc.; i.e. create a nicely structured and detailed answer for the question that can be shown to the user who asked it.
+- Note: The final part is strictly public-facing (no meta/process/thinking) and is to be enclosed in <answer></answer> tags; the thinking/planning/reasoning is internal and must be enclosed within <think></think> tags.
+- The final part can be short or detailed depending on the question, but it has to be separately enclosed in <answer></answer> tags and must come after the thinking block (which is enclosed in <think></think> tags).
+
+Style:
+- Clear prose and paragraphs; use LaTeX sparingly for clarity in math.
+- Prefer thorough and detailed coverage; keep it shorter for trivial items.
+- Use only facts present in the trace. If something is uncertain or missing, state it plainly and proceed with best-effort reasoning.
+- Provide a detailed explanation/methodology/solution in the final response (public-facing part), whatever the user query calls for. Do not just give a one-line/very-short final response.
+- The final response should be well-structured and detailed and is to be enclosed within <answer></answer> tags.
+- The reasoning part is non-public-facing and internal, and should be enclosed within <think></think> tags.
+
+**OUTPUT FORMAT:**
+- Enclose your thinking/reasoning/planning (if you are thinking before answering) within the <think></think> tags: <think>{thinking here}</think>{response here}
+- It is compulsory to use the <think></think> tags for enclosing planning/thinking/internal reasoning.
+- Return the final answer in the format:
+  ```<think>{your thinking here}</think>
+  <answer>{your final answer here}</answer>```
+- The final answer part of the response is strictly public-facing and should be well-structured and detailed.
+- Markdown only.
+"""
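A minimal sketch of how one of these constants might be wired into a chat request; the message layout and the sample QUESTION/TRACE below are assumptions for illustration, not code from this commit:

```python
from re_call.prompts import SUMMARY_SYS_PROMPT

# Hypothetical inputs; in practice the trace comes from a ReCall run.
question = "When did the EU's GDPR go into effect?"
trace = "search_urls('GDPR effective date') -> ... 'applies from 25 May 2018' ..."

messages = [
    {"role": "system", "content": SUMMARY_SYS_PROMPT},
    {"role": "user", "content": f"QUESTION:\n{question}\n\nTRACE:\n{trace}"},
]
# `messages` can now be sent to any chat-completions style endpoint; per the
# prompt, the reply should arrive as <think>...</think><answer>...</answer>.
```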
re_call/re_call.py ADDED
@@ -0,0 +1,1490 @@
+import re
+import os
+import json
+import requests
+import time
+from typing import List, Optional, Dict
+from .prompts import DEEPRESEARCH_SYS_PROMPT, SUMMARY_SYS_PROMPT
+from functools import wraps
+from together import Together  # pip install together
+from datetime import datetime  # needed for retries / logging and the date string (for giving the current date and time to the LLM)
+
+# retry decorator
+def retry(max: int = 10, sleep: int = 1, fallback=None):
+    """
+    Retry `max` times and, if still failing, return `fallback`
+    instead of raising. This keeps outer loops alive.
+    """
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            for i in range(max):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    print(f"[retry] attempt {i+1}/{max} failed: {e}")
+                    if i == max - 1:  # last try exhausted
+                        print(f"[retry] giving up - returning {fallback!r}")
+                        return fallback  # swallow the error
+                    if sleep:
+                        time.sleep(sleep)
+        return wrapper
+    return decorator
+
+class ReCall():
+    date_str = \
+f"""
+
+**Note**: Today's Date is {datetime.now().strftime("%Y-%m-%d")}, and time is {datetime.now().strftime("%H:%M:%S")}. This may be useful for answering questions about current events."""
+
+    anti_chinese_str = \
+"""
+
+**Note**: Do not respond in Chinese, do not think in Chinese; only think and respond/answer in English, unless explicitly instructed by the user to respond in some other language."""
+
+    # proper_formatting_str = \
+    # """
+    # **Note**: Provide a well-structured answer first, then put only the final short answer in \\boxed{{}}.
+
+    # **How to format your response**
+    # - Write in clear English prose and use Markdown headings/bullets where helpful.
+    # - Give a detailed, in-depth explanation of the steps or facts used.
+    # - Use LaTeX only for short formulas/equations. For multi-line LaTeX, include line breaks (\\\\) or environments like \\begin{{align}} ... \\end{{align}} when genuinely helpful.
+    # - Do **not** wrap the whole response in LaTeX. Only the final short answer goes in \\boxed{{...}} on its own line at the end.
+
+    # **Examples**
+
+    # 1) **Simple fact question**
+    # **Question:** What is the capital of India?
+    # **Brief rationale:** India's seat of government and primary national institutions are located in New Delhi.
+    # **Final:** \\boxed{{New Delhi}}
+
+    # 2) **Quick calculation**
+    # **Question:** Convert 68^\\circ F to Celsius.
+    # **Approach:** Use C = (F - 32) \\times \\tfrac{{5}}{{9}}.
+    # **Computation:** (68 - 32) \\times \\tfrac{{5}}{{9}} = 20.
+    # **Final:** \\boxed{{20^\\circ C}}
+
+    # 3) **Search & synthesis (structured, detailed)**
+    # **Question:** When did the EU's GDPR go into effect?
+
+    # **Complete Final Response:**
+    # '''**Key findings (evidence, concise):**
+    # - **European Commission overview** states GDPR "applies from 25 May 2018."
+    # - **EUR-Lex (Regulation (EU) 2016/679), Article 99**: entered into force 20 days after publication in the OJ (2016), and **applies from 25 May 2018**.
+    # - **EDPB FAQs/communications** reiterate that enforcement/application begins **25 May 2018**.
+
+    # **Cross-check & validation:**
+    # - Independent primary sources (Commission portal and EUR-Lex) agree on the same application date. A supervisory body source (EDPB) corroborates.
+
+    # **Common pitfalls addressed:**
+    # - Some secondary blogs list **24 May 2018**; this confuses the **last day before** applicability with the first day **of** applicability.
+    # - "Entered into force" in **2016** (post-publication) is not the same as "application/effective for obligations," which is **2018**.
+
+    # **Date normalization:**
+    # - Normalize to an unambiguous calendar date and present it in a clear format (e.g., "May 25, 2018").
+
+    # **Conclusion:**
+    # - The effective (application) date for GDPR obligations across the EU is the same in all Member States and is confirmed by multiple primary sources.
+
+    # **Final:** \\boxed{{May\ 25,\ 2018}}'''
+    # """
+
+    # print(f"Date string:\n'{date_str}'")
+
+    # proper_formatting_str = \
+    # """
+    # **DeepResearch Response Protocol**
+    # Provide a comprehensive, decision-grade report first, then put only the short final answer in \\boxed{{}} on its own line at the very end.
+
+    # ---
+
+    # ## Mandatory Sections (in order)
+
+    # 1) **Executive Summary**
+    #    - 5-10 bullets capturing the direct answer, key numbers/dates, and the top implications.
+    #    - Include any material uncertainty (e.g., "moderate confidence due to limited primary data").
+
+    # 2) **Problem Framing & Scope**
+    #    - One short paragraph restating the question, goals, and audience.
+    #    - Clarify interpretations, exclusions, and assumptions. Define key terms and acronyms.
+
+    # 3) **Method (Search & Validation Plan)**
+    #    - 5-8 bullets detailing how you searched and validated. Include:
+    #      - **Source priority:** primary/official (laws, filings, standards, regulator notices) → reputable secondary (major outlets, respected orgs) → tertiary/background.
+    #      - **Query strategy:** main queries and alternates (synonyms, regional spellings, technical names).
+    #      - **Freshness policy:** prefer the most recent authoritative updates; when dates matter, distinguish **event date**, **publication/update date**, and **effective date**.
+    #      - **Triangulation rule:** corroborate all key claims with ≥2 independent reputable sources (or 1 clear primary).
+    #      - **Inclusion/Exclusion:** note discarded sources (paywalled, low quality, self-published without review) and why.
+    #      - **Conflict resolution:** how disagreements will be weighed (mandate, jurisdiction, methodological rigor, recency).
+
+    # 4) **Evidence Ledger (Cited Facts)**
+    #    - 6-15 bullets. Each bullet is a **Fact Card**:
+    #      - **Claim:** one-sentence fact.
+    #      - **Evidence:** short quote/figure/line (paraphrase unless a short quote is essential).
+    #      - **Source:** Publisher/Title - (Event Date if applicable) - Publish/Update Date - Access Date.
+    #      - **Confidence:** High / Medium / Low.
+    #    - Group with mini-subheadings where helpful (e.g., "Official notices", "Regulatory filings", "Press coverage").
+    #    - Explicitly flag contradictions.
+
+    # 5) **Timeline of Key Events**
+    #    - A compact, chronological list linking milestones to sources; include both event and publication dates where relevant.
+
+    # 6) **Data Extraction & Normalization** (as needed)
+    #    - Present important numbers in a small table (≤8 rows) with units, currency (ISO codes, e.g., **USD**), and rounding policy (state precision, e.g., "rounded to 2 decimals").
|
| 135 |
+
# - Perform any conversions or calculations and show formulas succinctly (LaTeX inline for short formulas, e.g., \\( C = (F-32)\\times\\tfrac{{5}}{{9}} \\); use \\begin{{align}}β¦\\end{{align}} for multi-step math).
|
| 136 |
+
# - Specify timezones for dates/times when relevant.
|
| 137 |
+
|
| 138 |
+
# 7) **Comparative & Sensitivity Analysis** (if applicable)
|
| 139 |
+
# - Contrast competing interpretations, options, or sources; note trade-offs.
|
| 140 |
+
# - Include a brief sensitivity or scenario check if a key parameter could materially change the conclusion.
|
| 141 |
+
|
| 142 |
+
# 8) **Synthesis & Conclusion**
|
| 143 |
+
# - 2β4 tight paragraphs that integrate the evidence, resolve conflicts, and explain *why* the conclusion follows.
|
| 144 |
+
# - Be explicit about scope limits and residual uncertainties.
|
| 145 |
+
|
| 146 |
+
# 9) **Risks, Caveats & Unknowns**
|
| 147 |
+
# - Bullet the major risks, data gaps, and what would most change the answer.
|
| 148 |
+
# - Note any ethical, legal, or safety considerations.
|
| 149 |
+
|
| 150 |
+
# 10) **Recommendations / Next Steps** (if applicable)
|
| 151 |
+
# - Actionable items tailored to the userβs likely goal (e.g., verify with regulator X, monitor source Y weekly, collect dataset Z).
|
| 152 |
+
|
| 153 |
+
# 11) **Answer (one sentence)**
|
| 154 |
+
# - State the direct answer clearly with units/timezone as needed.
|
| 155 |
+
|
| 156 |
+
# 12) **Final**
|
| 157 |
+
# - Repeat only the short final answer inside \\boxed{{...}} with no extra words.
|
| 158 |
+
|
| 159 |
+
# 13) **Source Log (Audit Trail)**
|
| 160 |
+
# - A compact, reproducible list: *Title β Publisher/Author β (Event Date, if any) β Publish/Update Date β Access Date β URL*.
|
| 161 |
+
# - Prefer diverse, authoritative domains; avoid duplicates.
|
| 162 |
+
|
| 163 |
+
# ---
|
| 164 |
+
|
| 165 |
+
# ## Formatting & Quality Rules
|
| 166 |
+
|
| 167 |
+
# - Use clear English with Markdown headings and bullets; favor short paragraphs.
|
| 168 |
+
# - Do **not** reveal inner monologue or hidden chain-of-thought; provide only public-facing rationale.
|
| 169 |
+
# - Use LaTeX sparingly for math; do **not** wrap the entire response in LaTeX. Only the final short answer goes in \\boxed{{...}}.
|
| 170 |
+
# - Always specify units, currency codes, and timezones when relevant.
|
| 171 |
+
# - When listing β₯3 items or comparing options, include a small, focused table rather than long prose.
|
| 172 |
+
# - If information is uncertain or contested, *quantify* the uncertainty (confidence labels or ranges) and state why.
|
| 173 |
+
|
| 174 |
+
# ---
|
| 175 |
+
|
| 176 |
+
# ## Depth & Completeness Expectations
|
| 177 |
+
|
| 178 |
+
# - **Complex/high-stakes queries**: Populate all sections thoroughly; provide triangulated citations and explicit conflict resolution.
|
| 179 |
+
# - **Simple fact queries**: Keep Sections 3β9 concise (one to two lines each) but still cite at least one authoritative source.
|
| 180 |
+
# - Strive for neutrality, reproducibility, and decision usefulness over verbosity.
|
| 181 |
+
|
| 182 |
+
# ---
|
| 183 |
+
# """
|
| 184 |
+
|
| 185 |
+
proper_formatting_str = """"""
|
| 186 |
+
|
| 187 |
+
sys_prompt_non_search = """You are a helpful assistant. You will answer the user's question based on your knowledge and reasoning ability. You do not have access to the internet or any external tools. Do not use search. Answer all questions yourself.""" + date_str + anti_chinese_str
|
| 188 |
+
|
| 189 |
+
sys_prompt_websailor_start = """
|
| 190 |
+
You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
|
| 191 |
+
In this environment you have access to a set of tools you can use to assist with the user query.
|
| 192 |
+
You may perform multiple rounds of function calls. In each round, you can call one or more functions.
|
| 193 |
+
|
| 194 |
+
As you proceed, adhere to the following principles:
|
| 195 |
+
|
| 196 |
+
1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
|
| 197 |
+
|
| 198 |
+
2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
|
| 199 |
+
|
| 200 |
+
3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins."""
|
| 201 |
+
|
| 202 |
+
sys_prompt_websailor = """
|
| 203 |
+
You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
|
| 204 |
+
In this environment you have access to a set of tools you can use to assist with the user query.
|
| 205 |
+
You may perform multiple rounds of function calls. In each round, you can call one or more functions.
|
| 206 |
+
|
| 207 |
+
As you proceed, adhere to the following principles:
|
| 208 |
+
|
| 209 |
+
1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
|
| 210 |
+
|
| 211 |
+
2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
|
| 212 |
+
|
| 213 |
+
3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
|
| 218 |
+
|
| 219 |
+
In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
|
| 220 |
+
The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
|
| 221 |
+
The results of the function calls will be given back to you after execution, \
|
| 222 |
+
and you can continue to call functions until you get the final answer for the user's question.
|
| 223 |
+
|
| 224 |
+
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
| 225 |
+
<tool_call>
|
| 226 |
+
{{"name": <function-name>, "arguments": <args-json-object>}}
|
| 227 |
+
</tool_call>
|
| 228 |
+
""" + date_str + anti_chinese_str + proper_formatting_str
|
| 229 |
+
|
| 230 |
+
sys_prompt_websailor_deepseek = """
|
| 231 |
+
You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
|
| 232 |
+
In this environment you have access to a set of tools you can use to assist with the user query.
|
| 233 |
+
You may perform multiple rounds of function calls. In each round, you can call one or more functions.
|
| 234 |
+
|
| 235 |
+
As you proceed, adhere to the following principles:
|
| 236 |
+
|
| 237 |
+
1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
|
| 238 |
+
|
| 239 |
+
2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
|
| 240 |
+
|
| 241 |
+
3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
|
| 246 |
+
|
| 247 |
+
In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
|
| 248 |
+
The reasoning process and function calling are enclosed within <think> </think> and <tool_calls_begin> <tool_calls_end> tags. \
|
| 249 |
+
The results of the function calls will be given back to you after execution, \
|
| 250 |
+
and you can continue to call functions until you get the final answer for the user's question. \
|
| 251 |
+
Finally, if you have got the answer, enclose it within \\boxed{{}} with latex format and do not continue to call functions, \
|
| 252 |
+
i.e., <think> Based on the response from the function call, I get the weather information. </think> The weather in Beijing on 2025-04-01 is \\[ \\boxed{{20C}} \\].
|
| 253 |
+
""" + date_str + anti_chinese_str + proper_formatting_str
|
| 254 |
+
|
| 255 |
+
# sys_prompt_websailor_deepseek = """
|
| 256 |
+
# You are a Web Information Seeking Master. Seek the internet thoroughly and provide accurate answers. You may use tools multiple times.
|
| 257 |
+
|
| 258 |
+
# Principles:
|
| 259 |
+
# 1) Persistent Actions for Answers: explore deeply until you find satisfactory information.
|
| 260 |
+
# 2) Repeated Verification: cross-check and validate before the final answer.
|
| 261 |
+
# 3) Attention to Detail: ensure sources are current, relevant, and credible.
|
| 262 |
+
|
| 263 |
+
# You have the following tools (JSONSchema):
|
| 264 |
+
# ```json
|
| 265 |
+
# {func_schemas}
|
| 266 |
+
# Follow this EXACT tool-call I/O protocol.
|
| 267 |
+
|
| 268 |
+
# TO CALL ONE OR MORE TOOLS:
|
| 269 |
+
# Respond only with this block (no extra text before/after):
|
| 270 |
+
# <ο½toolβcallβbeginο½>function<ο½toolβsepο½>{tool_name}{args_json}
|
| 271 |
+
# <ο½toolβcallβendο½>
|
| 272 |
+
# ... (repeat <ο½toolβcallβbeginο½>β¦<ο½toolβcallβendο½> for multiple tools)
|
| 273 |
+
# <ο½toolβcallsβendο½><ο½endβofβsentenceο½>
|
| 274 |
+
|
| 275 |
+
# HOW TOOL RESULTS ARRIVE:
|
| 276 |
+
# I will send tool outputs back embedded inside a single user message, each wrapped like:
|
| 277 |
+
# <tool_response>{one_tool_call_you_made}
|
| 278 |
+
# {tool_return_text_or_json}
|
| 279 |
+
# </tool_response>
|
| 280 |
+
|
| 281 |
+
# WHAT TO DO NEXT:
|
| 282 |
+
|
| 283 |
+
# If you still need info, emit another tool-calls block (same exact format).
|
| 284 |
+
|
| 285 |
+
# If you have the final answer, output:
|
| 286 |
+
# <answer> β¦your final answerβ¦ </answer>
|
| 287 |
+
# and DO NOT call any more tools.
|
| 288 |
+
|
| 289 |
+
# Important:
|
| 290 |
+
|
| 291 |
+
# Do not expose your internal reasoning; keep thoughts private.
|
| 292 |
+
|
| 293 |
+
# When emitting a tool-calls block, do not include any explanations, only the block specified above.
|
| 294 |
+
|
| 295 |
+
# Arguments must be valid JSON.
|
| 296 |
+
|
| 297 |
+
# Stop tokens to respect: <ο½endβofβsentenceο½>
|
| 298 |
+
# """
|
| 299 |
+
|
| 300 |
+
system_prompt = """In this environment you have access to a set of tools you can use to assist with the user query. \
|
| 301 |
+
You may perform multiple rounds of function calls. \
|
| 302 |
+
In each round, you can call one or more functions. \
|
| 303 |
+
|
| 304 |
+
Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
|
| 305 |
+
|
| 306 |
+
In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
|
| 307 |
+
The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
|
| 308 |
+
The results of the function calls will be given back to you after execution, \
|
| 309 |
+
and you can continue to call functions until you get the final answer for the user's question. You are encouraged to utilize as many function calls as possible. \
|
| 310 |
+
Finally, if you have got the answer, wrap it in <answer> </answer> **and do not call any more functions**, \
|
| 311 |
+
e.g. <think> Based on the tool results β¦ </think> <answer>20 Β°C</answer>.
|
| 312 |
+
|
| 313 |
+
For each function call, return a JSON object with function name and arguments within <tool_call></tool_call> XML tags:
|
| 314 |
+
<tool_call>
|
| 315 |
+
{{"name": <function-name-1>, "arguments": <args-json-object>}}
|
| 316 |
+
</tool_call>""" + date_str + anti_chinese_str + proper_formatting_str
|
| 317 |
+
|
| 318 |
+
system_prompt_budget = """
|
| 319 |
+
You are an autonomous reasoning agent with access to external tools.
|
| 320 |
+
|
| 321 |
+
The conversation will retain only the *most-recent* <tool_response> block; older ones disappear.
|
| 322 |
+
As soon as you receive tool results, extract the *essential facts tables links etc* that might be needed for later and restate them inside your <think> section.
|
| 323 |
+
β**Never copy large bodies of text** or raw JSON from tool output into your visible reply; summarise instead.
|
| 324 |
+
|
| 325 |
+
β **Workflow**
|
| 326 |
+
1. In every round, start with <think> β¦ </think> to lay out your short reasoning.
|
| 327 |
+
2. If you need external information or an action, emit one or more <tool_call> β¦ </tool_call> blocks (JSON spec below).
|
| 328 |
+
3. When the environment returns <tool_response>, continue reasoning; you may call more tools.
|
| 329 |
+
4. Once you can answer the user, wrap the final result in <answer> β¦ </answer> and STOP calling tools.
|
| 330 |
+
|
| 331 |
+
β **Tool call format** (do **not** restate the schema or any explanations):
|
| 332 |
+
<tool_call>
|
| 333 |
+
{{"name": <function-name-1>, "arguments": <args-json-object>}}
|
| 334 |
+
</tool_call>
|
| 335 |
+
|
| 336 |
+
Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
|
| 337 |
+
""" + date_str + anti_chinese_str + proper_formatting_str
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
system_prompt_forcing_tool_call = """
|
| 342 |
+
In this environment you have access to a set of tools you can use to assist with the user query.
|
| 343 |
+
You may perform multiple rounds of function calls upto ten. In each round, you can call upto three functions.
|
| 344 |
+
|
| 345 |
+
ββββββββββββββββββββββββ AVAILABLE TOOLS ββββββββββββββββββββββββ
|
| 346 |
+
```json
|
| 347 |
+
[
|
| 348 |
+
{
|
| 349 |
+
"type": "function",
|
| 350 |
+
"function": {
|
| 351 |
+
"name": "pubmed_search",
|
| 352 |
+
"description": "Search PubMed for Medical related queries.",
|
| 353 |
+
"parameters": {
|
| 354 |
+
"type": "object",
|
| 355 |
+
"properties": {
|
| 356 |
+
"query": { "type": "string", "description": "Query to search for." },
|
| 357 |
+
"top_n": { "type": "integer", "description": "Number of hits", "default": 3 }
|
| 358 |
+
},
|
| 359 |
+
"required": ["query"]
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
]
|
| 364 |
+
```
|
| 365 |
+
|
| 366 |
+
ββββββββββββββββββββββββββββββ RULES ββββββββββββββββββββββββββββββ
|
| 367 |
+
1. You MUST issue one pubmed_search tool call for each answer choice. Each query must relate the clinical context to that option.
|
| 368 |
+
2. You MAY NOT skip any option or decide based only on internal reasoning. Evidence must be retrieved for all choices.
|
| 369 |
+
3. You MAY issue follow-up tool calls if your reasoning leads you to need more evidence.
|
| 370 |
+
4. You MUST wrap all reasoning in <think> </think> tags and all tool usage in <tool_call> </tool_call> tags. Number of <tool_call> and </tool_call> tokens in the entire trace MUST always match.
|
| 371 |
+
5. Do NOT casually emit the <tool_call> </tool_call> during reasoning unless explicitly calling a tool in the proper format.
|
| 372 |
+
5. Your final answer must be enclosed a single letter corresponding to the correct option enclosed in the <answer> </answer> tags. Do not output anything else inside these tags.
|
| 373 |
+
6. DO NOT use any other confusing tags like <thiking> or </thinking>.
|
| 374 |
+
7. Each <think> </think> block MUST be followed by a <tool_call> </tool_call> or <answer> </answer> or else the program will break without an answer.
|
| 375 |
+
|
| 376 |
+
βββββββββββββββββββββ DUMMY EXAMPLE INTERLEAVED SKELETON βββββββββββββββββββββ
|
| 377 |
+
<think>
|
| 378 |
+
We are presented with a 54-year-old woman with invasive ductal carcinoma of the breast and osteolytic lesions in the thoracic spine. This strongly suggests metastatic spread. Our task is to determine the most likely anatomical route of metastasis to the spine.
|
| 379 |
+
|
| 380 |
+
Letβs examine the given options:
|
| 381 |
+
A. Hemiazygos vein
|
| 382 |
+
B. Posterior intercostal veins
|
| 383 |
+
C. Batsonβs vertebral venous plexus
|
| 384 |
+
D. Internal mammary lymphatics
|
| 385 |
+
|
| 386 |
+
We'll evaluate each option in turn using available literature and known anatomical pathways.
|
| 387 |
+
**Option A: Hemiazygos vein**
|
| 388 |
+
We begin by evaluating whether the hemiazygos vein could be involved in metastatic spread from breast cancer to the spine.
|
| 389 |
+
</think>
|
| 390 |
+
<tool_call>
|
| 391 |
+
{"name": "pubmed_search", "arguments": {"query": "breast cancer metastasis hemiazygos vein", "top_n": 2}}
|
| 392 |
+
</tool_call>
|
| 393 |
+
<tool_response>
|
| 394 |
+
...
|
| 395 |
+
</tool_response>
|
| 396 |
+
<think>
|
| 397 |
+
There is limited or no strong evidence suggesting the hemiazygos vein is a common or primary route for vertebral metastasis from breast cancer.
|
| 398 |
+
Lets explore **Option B: Posterior intercostal veins** and **Option C: Batsonβs vertebral venous plexus** and **Option D:Internal mammary lymphatics**
|
| 399 |
+
</think>
|
| 400 |
+
<tool_call>
|
| 401 |
+
{"name": "pubmed_search", "arguments": {"query": "posterior intercostal veins breast cancer spinal metastasis", "top_n": 3}}
|
| 402 |
+
</tool_call>
|
| 403 |
+
<tool_call>
|
| 404 |
+
{"name": "pubmed_search", "arguments": {"query": "Batson vertebral venous plexus breast cancer metastasis", "top_n": 3}}
|
| 405 |
+
</tool_call>
|
| 406 |
+
<tool_call>
|
| 407 |
+
{"name": "pubmed_search", "arguments": {"query": "Internal mammary lymphatics breast cancer metastasis", "top_n": 3}}
|
| 408 |
+
</tool_call>
|
| 409 |
+
<tool_response>
|
| 410 |
+
...
|
| 411 |
+
</tool_response>
|
| 412 |
+
<think>
|
| 413 |
+
While the posterior intercostal veins may be involved in venous drainage, there is insufficient evidence to support them as a primary route for metastasis to the vertebral column.
|
| 414 |
+
where as Batsonβs vertebral venous plexus β a valveless venous network that connects the thoracic and abdominal veins directly to the spine. I to find more specific information about option C.
|
| 415 |
+
</think>
|
| 416 |
+
<tool_call>
|
| 417 |
+
{"name": "pubmed_search", "arguments": {"query": ""Batson vertebral venous plexus breast cancer metastasis in people over 50", "top_n": 1}}
|
| 418 |
+
</tool_call>
|
| 419 |
+
<think>
|
| 420 |
+
After evaluating all four options, the most plausible route for breast cancer metastasis to the thoracic spine is clearly via Batsonβs vertebral venous plexus:
|
| 421 |
+
</think>
|
| 422 |
+
<answer>C</answer>
|
| 423 |
+
""" + date_str + anti_chinese_str + proper_formatting_str
|
| 424 |
+
# STOP_TOKENS =STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def __init__(self, executor_url):
|
| 428 |
+
self.executor_url = executor_url
|
| 429 |
+
|
| 430 |
+
def init_prompt(self, func_schemas, question, old_prompt: Optional[str] = None, search_on: bool = True) -> str:
|
| 431 |
+
if old_prompt is None or len(old_prompt.strip()) == 0:
|
| 432 |
+
if search_on:
|
| 433 |
+
system_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
|
| 434 |
+
else:
|
| 435 |
+
system_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
|
| 436 |
+
user_prompt = f"<|im_start|>user\n{question}<|im_end|>"
|
| 437 |
+
assistant_prefix = f"<|im_start|>assistant\n<think>"
|
| 438 |
+
return system_prompt + "\n" + user_prompt + "\n" + assistant_prefix
|
| 439 |
+
else:
|
| 440 |
+
user_prompt = f"<|im_start|>user\n{question}<|im_end|>"
|
| 441 |
+
assistant_prefix = f"<|im_start|>assistant\n<think>"
|
| 442 |
+
return old_prompt + "\n" + user_prompt + "\n" + assistant_prefix
|
| 443 |
+
|
| 444 |
+
def replace_sys_prompt(self, old_prompt: str, func_schemas: str, search_on: bool = True) -> str:
|
| 445 |
+
if search_on:
|
| 446 |
+
new_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
|
| 447 |
+
old_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
|
| 448 |
+
else:
|
| 449 |
+
new_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
|
| 450 |
+
old_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
|
| 451 |
+
|
| 452 |
+
return old_prompt.replace(old_sys_prompt, new_sys_prompt)
|
| 453 |
+
|
| 454 |
+
def _strip_old_tool_responses(self, prompt: str) -> str:
|
| 455 |
+
TOOL_RESPONSE_RE = re.compile(r"<tool_response>.*?</tool_response>\s*", re.DOTALL)
|
| 456 |
+
"""Remove every existing <tool_response> β¦ </tool_response> block."""
|
| 457 |
+
return TOOL_RESPONSE_RE.sub("", prompt)
|
| 458 |
+
|
| 459 |
+
def cat_assistant_response(self, curr_prompt, assistant_response):
|
| 460 |
+
return curr_prompt + assistant_response + "<|im_end|>"
|
| 461 |
+
|
| 462 |
+
def cat_tool_results(self, curr_prompt, tool_calls, results):
|
| 463 |
+
tool_response_str = ""
|
| 464 |
+
for tool_call, result in zip(tool_calls, results):
|
| 465 |
+
tool_response_str += f"<tool_response>{tool_call}\n{result}\n</tool_response>\n"
|
| 466 |
+
tool_response_str = f"<|im_start|>user\n{tool_response_str}<|im_end|>"
|
| 467 |
+
assistant_prefix = f"<|im_start|>assistant\n<think>"
|
| 468 |
+
return curr_prompt + "\n" + tool_response_str + "\n" + assistant_prefix
|
| 469 |
+
|
| 470 |
+
def format_tool_call(self, tool_call_str: str):
|
| 471 |
+
"""Convert JSON function call description to Python executable code string."""
|
| 472 |
+
try:
|
| 473 |
+
call_json = json.loads(tool_call_str)
|
| 474 |
+
func_name = call_json['name']
|
| 475 |
+
arguments = call_json.get('arguments', {})
|
| 476 |
+
|
| 477 |
+
args_str = ', '.join(f"{k}={repr(v)}" for k, v in arguments.items())
|
| 478 |
+
return f"{func_name}({args_str})"
|
| 479 |
+
except Exception as e:
|
| 480 |
+
return f"Parse tool call failed: {e}"
|
| 481 |
+
|
| 482 |
+
def execute_tool_calls(self, env: str, tool_calls: List[str]) -> List[str]:
|
| 483 |
+
def exe_tool_call(env, call):
|
| 484 |
+
url = self.executor_url + '/execute'
|
| 485 |
+
|
| 486 |
+
call_str = self.format_tool_call(call)
|
| 487 |
+
# print(call_str)
|
| 488 |
+
if call_str.startswith("error: parse tool call failed"):
|
| 489 |
+
return call_str
|
| 490 |
+
|
| 491 |
+
try:
|
| 492 |
+
data = {
|
| 493 |
+
'env': env,
|
| 494 |
+
'call': call_str
|
| 495 |
+
}
|
| 496 |
+
response = requests.post(url, json=data, timeout=60)
|
| 497 |
+
if response.status_code != 200:
|
| 498 |
+
return f"error: {response.status_code}"
|
| 499 |
+
response = response.json()
|
| 500 |
+
ret_str = ''
|
| 501 |
+
if response['result']:
|
| 502 |
+
ret_str += f'result: \n{response["result"]}\n'
|
| 503 |
+
if response['output']:
|
| 504 |
+
ret_str += f'output: \n{response["output"]}\n'
|
| 505 |
+
if response['error']:
|
| 506 |
+
ret_str += f'error: \n{response["error"]}\n'
|
| 507 |
+
return ret_str.strip()
|
| 508 |
+
except requests.exceptions.Timeout:
|
| 509 |
+
return "error: execution timed out"
|
| 510 |
+
except Exception as e:
|
| 511 |
+
return str(e)
|
| 512 |
+
|
| 513 |
+
results = []
|
| 514 |
+
for tool_call in tool_calls:
|
| 515 |
+
result = exe_tool_call(env, tool_call)
|
| 516 |
+
results.append(result)
|
| 517 |
+
return results
|
| 518 |
+
|
| 519 |
+
def validate_tool_calls(self, output_str):
|
| 520 |
+
start_tags = re.findall(r'<tool_call>', output_str)
|
| 521 |
+
end_tags = re.findall(r'</tool_call>', output_str)
|
| 522 |
+
|
| 523 |
+
if len(start_tags) != len(end_tags):
|
| 524 |
+
return False
|
| 525 |
+
|
| 526 |
+
start_positions = [m.start() for m in re.finditer(r'<tool_call>', output_str)]
|
| 527 |
+
end_positions = [m.start() for m in re.finditer(r'</tool_call>', output_str)]
|
| 528 |
+
|
| 529 |
+
for start, end in zip(start_positions, end_positions):
|
| 530 |
+
if start >= end:
|
| 531 |
+
return False
|
| 532 |
+
|
| 533 |
+
return True
|
| 534 |
+
|
| 535 |
+
def extract_tool_calls(self, output_str):
|
| 536 |
+
if not self.validate_tool_calls(output_str):
|
| 537 |
+
return []
|
| 538 |
+
|
| 539 |
+
try:
|
| 540 |
+
pattern = r'<tool_call>((?:(?!</tool_call>).)*)</tool_call>'
|
| 541 |
+
matches = re.finditer(pattern, output_str, re.DOTALL)
|
| 542 |
+
|
| 543 |
+
return [match.group(1).strip() for match in matches]
|
| 544 |
+
except Exception as e:
|
| 545 |
+
return []
|
| 546 |
+
|
| 547 |
+
def extract_tool_calls_deepseek(self, output_str):
|
| 548 |
+
if not self.validate_tool_calls(output_str):
|
| 549 |
+
return []
|
| 550 |
+
|
| 551 |
+
try:
|
| 552 |
+
pattern = r'<tool_calls_begin>((?:(?!</tool_calls_end>).)*)<tool_calls_end>'
|
| 553 |
+
matches = re.finditer(pattern, output_str, re.DOTALL)
|
| 554 |
+
|
| 555 |
+
return [match.group(1).strip() for match in matches]
|
| 556 |
+
except Exception as e:
|
| 557 |
+
return []
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
@retry(max=5, sleep=1, fallback={"score": 0})
|
| 562 |
+
def run_ii_searcher(
|
| 563 |
+
self,
|
| 564 |
+
env: str,
|
| 565 |
+
func_schemas: str,
|
| 566 |
+
question: str,
|
| 567 |
+
tokenizer,
|
| 568 |
+
model_url="http://0.0.0.0:1214",
|
| 569 |
+
temperature: float = 0.0,
|
| 570 |
+
max_new_tokens: int = 40960,
|
| 571 |
+
):
|
| 572 |
+
curr_prompt = self.init_prompt(func_schemas, question)
|
| 573 |
+
all_tool_calls= []
|
| 574 |
+
|
| 575 |
+
for _ in range(16):
|
| 576 |
+
prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
|
| 577 |
+
max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
|
| 578 |
+
# for oss model served via vllm
|
| 579 |
+
# response = requests.post(
|
| 580 |
+
# f'{model_url}/v1/chat/completions',
|
| 581 |
+
# json={
|
| 582 |
+
# "text": curr_prompt,
|
| 583 |
+
# # "reasoning": "medium"
|
| 584 |
+
# },
|
| 585 |
+
# ).json()
|
| 586 |
+
# for sglang served models hf models
|
| 587 |
+
response = requests.post(
|
| 588 |
+
f'{model_url}/generate',
|
| 589 |
+
json={
|
| 590 |
+
"text": curr_prompt,
|
| 591 |
+
"sampling_params": {
|
| 592 |
+
"temperature": temperature,
|
| 593 |
+
"max_new_tokens": max_tokens_left,
|
| 594 |
+
"repetition_penalty": 1.05
|
| 595 |
+
},
|
| 596 |
+
|
| 597 |
+
}
|
| 598 |
+
).json()
|
| 599 |
+
if "error" in response.keys():
|
| 600 |
+
print("resp",response)
|
| 601 |
+
curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
|
| 602 |
+
|
| 603 |
+
tool_calls: List[str] = self.extract_tool_calls(response['text'])
|
| 604 |
+
all_tool_calls += tool_calls
|
| 605 |
+
|
| 606 |
+
if len(tool_calls) == 0:
|
| 607 |
+
break
|
| 608 |
+
|
| 609 |
+
else:
|
| 610 |
+
results: List[str] = self.execute_tool_calls(env, tool_calls)
|
| 611 |
+
curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
|
| 612 |
+
|
| 613 |
+
return curr_prompt, all_tool_calls
|
| 614 |
+
|
| 615 |
+
# @retry(max=5, sleep=1, fallback={"score": 0})
|
| 616 |
+
# def run(
|
| 617 |
+
# self,
|
| 618 |
+
# env: str,
|
| 619 |
+
# func_schemas: str,
|
| 620 |
+
# question: str,
|
| 621 |
+
# tokenizer,
|
| 622 |
+
# model_url="http://0.0.0.0:1214",
|
| 623 |
+
# temperature: float = 0.0,
|
| 624 |
+
# max_new_tokens: int = 40960,
|
| 625 |
+
# ):
|
| 626 |
+
# curr_prompt = self.init_prompt(func_schemas, question)
|
| 627 |
+
# all_tool_calls= []
|
| 628 |
+
|
| 629 |
+
# for i in range(32):
|
| 630 |
+
# prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
|
| 631 |
+
# max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
|
| 632 |
+
# # for oss model served via vllm
|
| 633 |
+
# # response = requests.post(
|
| 634 |
+
# # f'{model_url}/v1/chat/completions',
|
| 635 |
+
# # json={
|
| 636 |
+
# # "text": curr_prompt,
|
| 637 |
+
# # # "reasoning": "medium"
|
| 638 |
+
# # },
|
| 639 |
+
# # ).json()
|
| 640 |
+
# # for sglang served models hf models
|
| 641 |
+
# response = requests.post(
|
| 642 |
+
# f'{model_url}/generate',
|
| 643 |
+
# json={
|
| 644 |
+
# "text": curr_prompt,
|
| 645 |
+
# "sampling_params": {
|
| 646 |
+
# "temperature": temperature,
|
| 647 |
+
# "max_new_tokens": max_tokens_left,
|
| 648 |
+
# "repetition_penalty": 1.05
|
| 649 |
+
# },
|
| 650 |
+
|
| 651 |
+
# }
|
| 652 |
+
# ).json()
|
| 653 |
+
# if "error" in response.keys():
|
| 654 |
+
# print("resp",response)
|
| 655 |
+
# curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
|
| 656 |
+
|
| 657 |
+
# tool_calls: List[str] = self.extract_tool_calls(response['text'])
|
| 658 |
+
# all_tool_calls += tool_calls
|
| 659 |
+
|
| 660 |
+
# if len(tool_calls) == 0:
|
| 661 |
+
# break
|
| 662 |
+
|
| 663 |
+
# else:
|
| 664 |
+
# # print(f"Step-{i+1}")
|
| 665 |
+
# results: List[str] = self.execute_tool_calls(env, tool_calls)
|
| 666 |
+
# curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
|
| 667 |
+
|
| 668 |
+
# return curr_prompt, all_tool_calls
|
| 669 |
+
from typing import List, Dict, Any, Tuple
|
| 670 |
+
import requests
|
| 671 |
+
|
| 672 |
+
def build_summary_prompt(self, question: str, transcript: str, tool_calls: Any) -> str:
|
| 673 |
+
"""Assemble a compact but detailed prompt for summarization."""
|
| 674 |
+
tool_str = ""
|
| 675 |
+
if tool_calls is not None:
|
| 676 |
+
try:
|
| 677 |
+
tool_str = str(tool_calls)
|
| 678 |
+
except Exception:
|
| 679 |
+
tool_str = "<unprintable tool_calls>"
|
| 680 |
+
return (
|
| 681 |
+
"You are given a DeepSearch investigation trace.\n\n"
|
| 682 |
+
f"Question:\n{question}\n\n"
|
| 683 |
+
"Trace (model transcript):\n"
|
| 684 |
+
f"{transcript}\n\n"
|
| 685 |
+
"Tool Calls (as-recorded):\n"
|
| 686 |
+
f"{tool_str}\n\n"
|
| 687 |
+
"β End of trace β"
|
| 688 |
+
)
|
| 689 |
+
|
| 690 |
+
def reformat_trace(self, s: str) -> str:
|
| 691 |
+
if not s:
|
| 692 |
+
return s
|
| 693 |
+
|
| 694 |
+
t = s
|
| 695 |
+
|
| 696 |
+
# 1) Speaker tags: <|im_start|>assistant -> "ASSISTANT:\n"
|
| 697 |
+
def _speaker(m: re.Match) -> str:
|
| 698 |
+
role = (m.group(1) or "").strip().upper()
|
| 699 |
+
return f"\n{role}:\n"
|
| 700 |
+
t = re.sub(r"<\|im_start\|\>(\w+)", _speaker, t, flags=re.IGNORECASE)
|
| 701 |
+
|
| 702 |
+
# 2) End-of-message tag: drop but keep spacing
|
| 703 |
+
t = re.sub(r"<\|im_end\|\>", "\n", t, flags=re.IGNORECASE)
|
| 704 |
+
|
| 705 |
+
# 3) THINK blocks: replace tags with label, keep content
|
| 706 |
+
t = re.sub(r"<think\s*>", "", t, flags=re.IGNORECASE)
|
| 707 |
+
t = re.sub(r"</think\s*>", "\n", t, flags=re.IGNORECASE)
|
| 708 |
+
|
| 709 |
+
# 4) TOOL RESPONSE blocks: support both 'response' and the misspelt 'resonse'
|
| 710 |
+
t = re.sub(r"<tool_respon[sc]e\s*>", "SEARCH RESULT\n", t, flags=re.IGNORECASE)
|
| 711 |
+
t = re.sub(r"</tool_respon[sc]e\s*>", "\n", t, flags=re.IGNORECASE)
|
| 712 |
+
|
| 713 |
+
# 5) TOOL CALL wrappers: drop tags, keep the JSON/content
|
| 714 |
+
t = re.sub(r"</?tool_call\s*>", "", t, flags=re.IGNORECASE)
|
| 715 |
+
|
| 716 |
+
# 6) Any remaining ChatML specials like <|eot_id|>, <|...|> -> remove
|
| 717 |
+
t = re.sub(r"<\|[^>]+?\|>", "", t)
|
| 718 |
+
|
| 719 |
+
# 7) Remove any other angle-bracket tags we didnβt explicitly keep
|
| 720 |
+
# (leaves inner text intact). This will strip e.g. <tool_response_extra>
|
| 721 |
+
t = re.sub(r"</?[^>\n]+?>", "", t)
|
| 722 |
+
|
| 723 |
+
# 8) Normalize whitespace (collapse 3+ newlines to 2)
|
| 724 |
+
t = re.sub(r"\n{3,}", "\n\n", t).strip()
|
| 725 |
+
|
| 726 |
+
return t
|
| 727 |
+
|
| 728 |
+
def _openai_client(self):
|
| 729 |
+
try:
|
| 730 |
+
from openai import OpenAI # type: ignore
|
| 731 |
+
except Exception as e:
|
| 732 |
+
raise RuntimeError("openai package not installed. `pip install openai`") from e
|
| 733 |
+
return OpenAI()
|
| 734 |
+
|
| 735 |
+
def init_summary_prompt(self, system_prompt: str, prompt: str) -> str:
|
| 736 |
+
system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>"
|
| 737 |
+
user_prompt = f"<|im_start|>user\n{prompt}<|im_end|>"
|
| 738 |
+
assistant_prefix = f"<|im_start|>assistant\n<think>"
|
| 739 |
+
return system_prompt + "\n" + user_prompt + "\n" + assistant_prefix
|
| 740 |
+
|
| 741 |
+
def _call_hf_endpoint(self, base_url: str, system_prompt: str, prompt: str, temperature: float, max_tokens: int, deepresearch_on: bool) -> str:
|
| 742 |
+
curr_prompt = self.init_summary_prompt(system_prompt, prompt)
|
| 743 |
+
|
| 744 |
+
hf_token= os.environ['HF_TOKEN']
|
| 745 |
+
|
| 746 |
+
headers = {
|
| 747 |
+
"Accept" : "application/json",
|
| 748 |
+
"Authorization": f"Bearer {hf_token}",
|
| 749 |
+
"Content-Type": "application/json"
|
| 750 |
+
}
|
| 751 |
+
|
| 752 |
+
# print(f"User Prompt:\n{curr_prompt}\n\n")
|
| 753 |
+
|
| 754 |
+
response_summary = requests.post(
|
| 755 |
+
url=f"{base_url}",
|
| 756 |
+
headers=headers,
|
| 757 |
+
json={
|
| 758 |
+
"inputs": curr_prompt,
|
| 759 |
+
"parameters": {
|
| 760 |
+
"temperature": temperature,
|
| 761 |
+
"max_new_tokens": max_tokens,
|
| 762 |
+
"top_p": 0.95,
|
| 763 |
+
"repetition_penalty": 1.05,
|
| 764 |
+
},
|
| 765 |
+
},
|
| 766 |
+
timeout=300,
|
| 767 |
+
).json()
|
| 768 |
+
|
| 769 |
+
if isinstance(response_summary, list):
|
| 770 |
+
response_summary = response_summary[0]
|
| 771 |
+
|
| 772 |
+
if isinstance(response_summary, dict) and "error" in response_summary:
|
| 773 |
+
# Log the error as assistant text for visibility and break
|
| 774 |
+
err_msg = f"[model_error] {response_summary.get('error')}"
|
| 775 |
+
print("Got error response from summarising model:", err_msg, end="\n\n")
|
| 776 |
+
|
| 777 |
+
assistant_text = response_summary.get("generated_text", "")
|
| 778 |
+
|
| 779 |
+
if curr_prompt == assistant_text[:len(curr_prompt)]:
|
| 780 |
+
assistant_text = assistant_text[len(curr_prompt):]
|
| 781 |
+
|
| 782 |
+
# print(assistant_text)
|
| 783 |
+
|
| 784 |
+
report = re.split(r"</think\s*>", assistant_text, flags=re.IGNORECASE)[-1]
|
| 785 |
+
# plan = re.split(r"</think\s*>", assistant_text, flags=re.IGNORECASE)[0]
|
| 786 |
+
|
| 787 |
+
# print(report, "\n\n")
|
| 788 |
+
|
| 789 |
+
if not deepresearch_on:
|
| 790 |
+
report = report.strip()
|
| 791 |
+
# report = report[::-1]
|
| 792 |
+
# str_find = "Final Answer:"
|
| 793 |
+
# pos = report.find(str_find[::-1])
|
| 794 |
+
# pos += len(str_find)
|
| 795 |
+
# report = report[pos:][::-1]
|
| 796 |
+
# report = report.rstrip('# \n-').strip(' \n-')
|
| 797 |
+
|
| 798 |
+
start_tag = "<answer>"
|
| 799 |
+
end_tag = "</answer>"
|
| 800 |
+
pos_start = report.find(start_tag)
|
| 801 |
+
pos_end = report[pos_start:].find(end_tag) + pos_start
|
| 802 |
+
answer = report
|
| 803 |
+
if pos_start != -1 and pos_end != -1:
|
| 804 |
+
answer = report[pos_start + len(start_tag):pos_end].strip()
|
| 805 |
+
|
| 806 |
+
str_find = "Final Answer:"
|
| 807 |
+
if str_find in answer:
|
| 808 |
+
answer = answer[::-1]
|
| 809 |
+
pos = answer.find(str_find[::-1])
|
| 810 |
+
pos += len(str_find)
|
| 811 |
+
answer = answer[pos:][::-1]
|
| 812 |
+
answer = answer.rstrip('# \n-').strip(' \n-')
|
| 813 |
+
|
| 814 |
+
# print("answer:")
|
| 815 |
+
# print(answer, "\n\n")
|
| 816 |
+
|
| 817 |
+
return answer
|
| 818 |
+
|
| 819 |
+
report = report.strip()
|
| 820 |
+
report = report[::-1]
|
| 821 |
+
str_find = "Sources used"
|
| 822 |
+
pos = report.find(str_find[::-1])
|
| 823 |
+
pos += len(str_find)
|
| 824 |
+
report = report[pos:][::-1]
|
| 825 |
+
report = report.rstrip('# \n-').strip(' \n-')
|
| 826 |
+
|
| 827 |
+
if not report.startswith("##") and report.startswith("#"):
|
| 828 |
+
report = "#" + report
|
| 829 |
+
elif not report.startswith("##") and not report.startswith("#"):
|
| 830 |
+
report = "## " + report
|
| 831 |
+
|
| 832 |
+
# report = '\n\n' + report.strip()
|
| 833 |
+
|
| 834 |
+
# print(report.find('Executive Summary'), report.find('#'))
|
| 835 |
+
# print(f"'{report[:20]}'")
|
| 836 |
+
|
| 837 |
+
# print(report,"\n\n")
|
| 838 |
+
|
| 839 |
+
urls = {}
|
| 840 |
+
count = 1
|
| 841 |
+
|
| 842 |
+
while "[http" in report:
|
| 843 |
+
start_idx = report.find("[http")
|
| 844 |
+
end_idx = report.find("]", start_idx)
|
| 845 |
+
if end_idx != -1:
|
| 846 |
+
url_string = report[start_idx + 1:end_idx]
|
| 847 |
+
url_list = []
|
| 848 |
+
while len(url_string) > 0:
|
| 849 |
+
pos1 = url_string.find(";")
|
| 850 |
+
pos2 = url_string.find(",")
|
| 851 |
+
pos3 = url_string.find(" ")
|
| 852 |
+
|
| 853 |
+
if pos1 == -1:
|
| 854 |
+
pos1 = len(url_string) + 1
|
| 855 |
+
if pos2 == -1:
|
| 856 |
+
pos2 = len(url_string) + 1
|
| 857 |
+
if pos3 == -1:
|
| 858 |
+
pos3 = len(url_string) + 1
|
| 859 |
+
|
| 860 |
+
pos = min(pos1, pos2, pos3)
|
| 861 |
+
|
| 862 |
+
if pos == len(url_string) + 1:
|
| 863 |
+
url = url_string
|
| 864 |
+
else:
|
| 865 |
+
url = url_string[:pos]
|
| 866 |
+
|
| 867 |
+
url_list.append(url)
|
| 868 |
+
|
| 869 |
+
if pos < len(url_string):
|
| 870 |
+
url_string = url_string[pos + 1:].lstrip(" ,;")
|
| 871 |
+
else:
|
| 872 |
+
break
|
| 873 |
+
|
| 874 |
+
report_new = report[:start_idx] + '(**'
|
| 875 |
+
for url in url_list:
|
| 876 |
+
if url not in urls:
|
| 877 |
+
urls[url] = count
|
| 878 |
+
count += 1
|
| 879 |
+
report_new += f'[{urls[url]}], '
|
| 880 |
+
report_new = report_new[:-2]
|
| 881 |
+
report_new += '**)' + report[end_idx+1:]
|
| 882 |
+
report = report_new
|
| 883 |
+
else:
|
| 884 |
+
break
|
| 885 |
+
|
| 886 |
+
if len(urls) > 0:
|
| 887 |
+
report += "\n\n## Sources used:\n"
|
| 888 |
+
sorted_urls = sorted(urls.items(), key=lambda x: x[1])
|
| 889 |
+
for url, idx in sorted_urls:
|
| 890 |
+
report += f"- **{idx}**: {url}\n"
|
| 891 |
+
report += '\n'
|
| 892 |
+
# adding references (auto-removed in markdown)
|
| 893 |
+
for url, idx in sorted_urls:
|
| 894 |
+
report += f"[{idx}]: {url}\n"
|
| 895 |
+
|
| 896 |
+
# print(report,"\n\n")
|
| 897 |
+
|
| 898 |
+
return report
|
| 899 |
+
|
| 900 |
+
def _route_and_summarize(
|
| 901 |
+
self,
|
| 902 |
+
summary_llm: str,
|
| 903 |
+
system_prompt: str,
|
| 904 |
+
prompt: str,
|
| 905 |
+
*,
|
| 906 |
+
temperature: float,
|
| 907 |
+
max_tokens: int,
|
| 908 |
+
deepresearch_on: bool,
|
| 909 |
+
) -> str:
|
| 910 |
+
"""
|
| 911 |
+
If `summary_llm` starts with 'http', treat as vLLM base_url; else treat as an OpenAI model id.
|
| 912 |
+
For vLLM, prepend [SYSTEM]/[USER] tags; for OpenAI, pass messages with system+user.
|
| 913 |
+
"""
|
| 914 |
+
if not summary_llm.strip().lower().startswith("gpt-"):
|
| 915 |
+
# print(system_prompt)
|
| 916 |
+
# print(prompt)
|
| 917 |
+
return self._call_hf_endpoint(summary_llm, system_prompt, prompt, temperature=temperature, max_tokens=max_tokens, deepresearch_on=deepresearch_on)
|
| 918 |
+
|
| 919 |
+
else:
|
| 920 |
+
client = self._openai_client()
|
| 921 |
+
rsp = client.chat.completions.create(
|
| 922 |
+
model=summary_llm,
|
| 923 |
+
temperature=temperature,
|
| 924 |
+
messages=[
|
| 925 |
+
{"role": "system", "content": system_prompt},
|
| 926 |
+
{"role": "user", "content": prompt},
|
| 927 |
+
],
|
| 928 |
+
max_tokens=max_tokens,
|
| 929 |
+
)
|
| 930 |
+
|
| 931 |
+
return rsp.choices[0].message.content or ""
|
| 932 |
+
|
| 933 |
+
@retry(max=5, sleep=1, fallback={"score": 0})
|
| 934 |
+
def run(
|
| 935 |
+
self,
|
| 936 |
+
env: str,
|
| 937 |
+
func_schemas: str,
|
| 938 |
+
question: str,
|
| 939 |
+
tokenizer,
|
| 940 |
+
model_url: str = "http://0.0.0.0:1214",
|
| 941 |
+
temperature: float = 0.0,
|
| 942 |
+
max_new_tokens: int = 40960,
|
| 943 |
+
top_p: float = 0.6,
|
| 944 |
+
old_prompt: Optional[str] = None,
|
| 945 |
+
deepresearch_on: bool = True,
|
| 946 |
+
summary_llm: str = "gpt-4.1-mini"
|
| 947 |
+
):
|
| 948 |
+
# ) -> Tuple[str, List[str], List[Dict[str, str]]]:
|
| 949 |
+
"""
|
| 950 |
+
Returns:
|
| 951 |
+
curr_prompt: the final prompt buffer (with assistant/tool traces you maintain internally)
|
| 952 |
+
all_tool_calls: flat list of all tool call strings extracted across steps
|
| 953 |
+
chat: a lightweight chat transcript list[{"role": "...", "content": "..."}]
|
| 954 |
+
β’ 'user' items = the original question + aggregated tool responses
|
| 955 |
+
β’ 'assistant' items = model responses (and a compact line-list of tool calls)
|
| 956 |
+
"""
|
| 957 |
+
# off_str = "\n\n**User has TURNED OFF search**. **DO NOT use search**. **Answer all questions YOURSELF**. **DO NOT use any tools**.\n**YOUR FIRST-RESPONSE WILL BE CONSIDERED AS THE FINAL ANSWER**. **YOU WILL NOT GET TO CALL TOOLS AND WAIT FOR TOOL RESULTS AND THEN ANSWER**.\n**YOU WON'T BE ALLOWED TO CHAT AND CALL TOOLS, IN A MULTI-TURN FASHION**. **YOU WILL CHAT IN A SINGLE-TURN FORMAT**.\n**SO MAKE SURE YOUR FIRST RESPONSE IS THE FINAL ANSWER**.\n"
|
| 958 |
+
|
| 959 |
+
# if not search_on and (old_prompt is not None and self.sys_prompt_websailor_start not in old_prompt):
|
| 960 |
+
# question += off_str
|
| 961 |
+
|
| 962 |
+
search_on = True
|
| 963 |
+
|
| 964 |
+
if old_prompt is not None:
|
| 965 |
+
old_prompt = self.replace_sys_prompt(old_prompt, func_schemas, search_on)
|
| 966 |
+
|
| 967 |
+
# Build runtime prompt and initialize accumulators
|
| 968 |
+
curr_prompt = self.init_prompt(func_schemas, question, old_prompt, search_on)
|
| 969 |
+
all_tool_calls: List[str] = []
|
| 970 |
+
chat: List[Dict[str, str]] = []
|
| 971 |
+
|
| 972 |
+
# Seed transcript with JUST the question (no system prompt)
|
| 973 |
+
chat.append({"role": "user", "content": question})
|
| 974 |
+
|
| 975 |
+
for i in range(64):
|
| 976 |
+
# Budget tokens for this step
|
| 977 |
+
prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
|
| 978 |
+
max_tokens_left = max(1, max_new_tokens - len(prompt_tokens) - 100)
|
| 979 |
+
|
| 980 |
+
# ---- Model call (sglang/vLLM-style JSON) ----
|
| 981 |
+
# If you switch to /v1/chat/completions, adjust accordingly.
|
| 982 |
+
hf_token= os.environ['HF_TOKEN']
|
| 983 |
+
|
| 984 |
+
headers = {
|
| 985 |
+
"Accept" : "application/json",
|
| 986 |
+
"Authorization": f"Bearer {hf_token}",
|
| 987 |
+
"Content-Type": "application/json"
|
| 988 |
+
}
|
| 989 |
+
|
| 990 |
+
# print(f"User Prompt:\n{curr_prompt}\n\n")
|
| 991 |
+
|
| 992 |
+
response = requests.post(
|
| 993 |
+
url=f"{model_url}",
|
| 994 |
+
headers=headers,
|
| 995 |
+
json={
|
| 996 |
+
"inputs": curr_prompt,
|
| 997 |
+
"parameters": {
|
| 998 |
+
"temperature": temperature,
|
| 999 |
+
"max_new_tokens": max_tokens_left,
|
| 1000 |
+
"top_p": top_p,
|
| 1001 |
+
"repetition_penalty": 1.05,
|
| 1002 |
+
},
|
| 1003 |
+
},
|
| 1004 |
+
timeout=300,
|
| 1005 |
+
).json()
|
| 1006 |
+
|
| 1007 |
+
if isinstance(response, list):
|
| 1008 |
+
response = response[0]
|
| 1009 |
+
|
| 1010 |
+
if isinstance(response, dict) and "error" in response:
|
| 1011 |
+
# Log the error as assistant text for visibility and break
|
| 1012 |
+
err_msg = f"[model_error] {response.get('error')}"
|
| 1013 |
+
print("Got error response from model:", err_msg, end="\n\n")
|
| 1014 |
+
chat.append({"role": "assistant", "content": err_msg})
|
| 1015 |
+
break
|
| 1016 |
+
|
| 1017 |
+
assistant_text = response.get("generated_text", "")
|
| 1018 |
+
|
| 1019 |
+
if curr_prompt == assistant_text[:len(curr_prompt)]:
|
| 1020 |
+
# print("Current prompt is a prefix to generated text.")
|
| 1021 |
+
# If the assistant's response is just a continuation of the prompt, we can use it directly
|
| 1022 |
+
assistant_text = assistant_text[len(curr_prompt):]
|
| 1023 |
+
|
| 1024 |
+
# print(f"Assistant Text:\n{assistant_text}\n\n")
|
| 1025 |
+
|
| 1026 |
+
# Append assistant's raw text to chat
|
| 1027 |
+
chat.append({"role": "assistant", "content": assistant_text})
|
| 1028 |
+
|
| 1029 |
+
# Update your running prompt with assistant text
|
| 1030 |
+
curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
|
| 1031 |
+
|
| 1032 |
+
# Extract tool calls from the assistant text
|
| 1033 |
+
if search_on:
|
| 1034 |
+
tool_calls: List[str] = self.extract_tool_calls(assistant_text)
|
| 1035 |
+
|
| 1036 |
+
else:
|
| 1037 |
+
tool_calls: List[str] = []
|
| 1038 |
+
|
| 1039 |
+
# yield "assistant_resp", (assistant_text, tool_calls)
|
| 1040 |
+
|
| 1041 |
+
if tool_calls:
|
| 1042 |
+
yield "assistant_resp", (assistant_text, tool_calls)
|
| 1043 |
+
all_tool_calls.extend(tool_calls)
|
| 1044 |
+
|
| 1045 |
+
# Log tool calls as an assistant message (newline-joined)
|
| 1046 |
+
chat.append({"role": "assistant", "content": "\n".join(tool_calls)})
|
| 1047 |
+
|
| 1048 |
+
# Execute tools and collect results
|
| 1049 |
+
results: List[str] = self.execute_tool_calls(env, tool_calls)
|
| 1050 |
+
|
| 1051 |
+
yield "tool_results", (results, )
|
| 1052 |
+
|
| 1053 |
+
# Feed tool results back into prompt
|
| 1054 |
+
curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
|
| 1055 |
+
|
| 1056 |
+
# Aggregate tool responses into a single user message
|
| 1057 |
+
tool_res_blocks = []
|
| 1058 |
+
for idx, (call, res) in enumerate(zip(tool_calls, results), 1):
|
| 1059 |
+
tool_res_blocks.append(f"[Tool {idx}] Result:\n{res}")
|
| 1060 |
+
chat.append({"role": "user", "content": "\n\n".join(tool_res_blocks)})
|
| 1061 |
+
|
| 1062 |
+
else:
|
| 1063 |
+
if search_on:
|
| 1064 |
+
prompt = self.build_summary_prompt(question, self.reformat_trace(curr_prompt) or "", all_tool_calls)
|
| 1065 |
+
system_prompt = DEEPRESEARCH_SYS_PROMPT if deepresearch_on else SUMMARY_SYS_PROMPT
|
| 1066 |
+
|
| 1067 |
+
summary_text = self._route_and_summarize(
|
| 1068 |
+
summary_llm=summary_llm if deepresearch_on else model_url,
|
| 1069 |
+
system_prompt=system_prompt,
|
| 1070 |
+
prompt=prompt,
|
| 1071 |
+
temperature=0.6,
|
| 1072 |
+
max_tokens=16000,
|
| 1073 |
+
deepresearch_on=deepresearch_on
|
| 1074 |
+
)
|
| 1075 |
+
|
| 1076 |
+
summary_text_splits = summary_text.split("</think>")
|
| 1077 |
+
summary_text_initial = summary_text_splits[0]
|
| 1078 |
+
summary_text_initial = summary_text_initial.replace("<think>", "").strip()
|
| 1079 |
+
summary_text_final = summary_text_splits[-1]
|
| 1080 |
+
|
| 1081 |
+
if len(summary_text_initial) > 0 and "</think>" in summary_text:
|
| 1082 |
+
yield "assistant_resp", (summary_text_initial, [])
|
| 1083 |
+
yield "tool_results", ([], )
|
| 1084 |
+
yield "assistant_resp", (summary_text_final, tool_calls)
|
| 1085 |
+
# print(f"No tool calls found in assistant response.\nAssistant Response:\n{assistant_text}\n\n")
|
| 1086 |
+
else:
|
| 1087 |
+
yield "assistant_resp", (assistant_text, tool_calls)
|
| 1088 |
+
print(f"Search is off, so no tool calls expected and no tool calls called.\nAssistant Response:\n{assistant_text}\n\n")
|
| 1089 |
+
# No tool calls β model produced a final answer; stop.
|
| 1090 |
+
break
|
| 1091 |
+
|
| 1092 |
+
# Return the original outputs plus the chat-style transcript
|
| 1093 |
+
# return curr_prompt, all_tool_calls, chat
|
| 1094 |
+
|
| 1095 |
+
return "end", (curr_prompt, )
|
| 1096 |
+
|
| 1097 |
+
@retry(max=5, sleep=1, fallback={"score": 0})
|
| 1098 |
+
def run_deepseek(
|
| 1099 |
+
self,
|
| 1100 |
+
env: str,
|
| 1101 |
+
func_schemas: str,
|
| 1102 |
+
question: str,
|
| 1103 |
+
model_name: str,
|
| 1104 |
+
temperature: float = 0.0,
|
| 1105 |
+
top_p: float = 0.95,
|
| 1106 |
+
max_tokens: int = 32768,
|
| 1107 |
+
):
|
| 1108 |
+
# print("AA"* 100)
|
| 1109 |
+
"""
|
| 1110 |
+
Chat-based ReCall loop for DeepSeek-R1 on Together.
|
| 1111 |
+
"""
|
| 1112 |
+
sys_content = self.sys_prompt_websailor_deepseek.format(func_schemas=func_schemas)
|
| 1113 |
+
# sys_content = self.init_prompt(func_schemas, question)
|
| 1114 |
+
|
| 1115 |
+
messages = [
|
| 1116 |
+
{"role": "system", "content": sys_content},
|
| 1117 |
+
{"role": "user", "content": question},
|
| 1118 |
+
]
|
| 1119 |
+
|
| 1120 |
+
# client = Together(api_key="")
|
| 1121 |
+
client = Together(api_key="")
|
| 1122 |
+
all_tool_calls = []
|
| 1123 |
+
for turn in range(32): # up to 10 reasoning turns
|
| 1124 |
+
resp = client.chat.completions.create(
|
| 1125 |
+
model=model_name,
|
| 1126 |
+
# model="Qwen/Qwen3-235B-A22B-fp8-tput",
|
| 1127 |
+
messages=messages,
|
| 1128 |
+
temperature=temperature,
|
| 1129 |
+
top_p=top_p,
|
| 1130 |
+
max_tokens=39000,
|
| 1131 |
+
stop=["<ο½endβofβsentenceο½>", "<|im_end|>"]
|
| 1132 |
+
)
|
| 1133 |
+
# print(resp)
|
| 1134 |
+
|
| 1135 |
+
|
| 1136 |
+
assistant_text = resp.choices[0].message.content
|
| 1137 |
+
# print(assistant_text)
|
| 1138 |
+
messages.append({"role": "assistant", "content": assistant_text})
|
| 1139 |
+
# print(f"assistant_output: {assistant_text}")
|
| 1140 |
+
|
| 1141 |
+
# β Safe tool call extraction with diagnostic
|
| 1142 |
+
# try:
|
| 1143 |
+
# print("Extracting tool calls")
|
| 1144 |
+
tool_calls = self.extract_tool_calls_deepseek(assistant_text)
|
| 1145 |
+
print(tool_calls)
|
| 1146 |
+
all_tool_calls += tool_calls
|
| 1147 |
+
# except Exception as e:
|
| 1148 |
+
# print(f"Extraction failed with exception {e}")
|
| 1149 |
+
# err_msg = f"<tool_response>Tool call extraction failed on turn {turn+1}: {str(e)}</tool_response>"
|
| 1150 |
+
# messages.append({"role": "user", "content": err_msg})
|
| 1151 |
+
# continue # continue to next turn instead of breaking
|
| 1152 |
+
if "<answer>" in assistant_text:
|
| 1153 |
+
break
|
| 1154 |
+
|
| 1155 |
+
if len(tool_calls) != 0:
|
| 1156 |
+
results = self.execute_tool_calls(env, tool_calls)
|
| 1157 |
+
tool_resp_block = "".join(
|
| 1158 |
+
f"<tool_response>{c}\n{r}\n</tool_response>\n"
|
| 1159 |
+
for c, r in zip(tool_calls, results)
|
| 1160 |
+
)
|
| 1161 |
+
messages.append({"role": "user", "content": tool_resp_block})
|
| 1162 |
+
# print(f"Tool Response {tool_resp_block}")
|
| 1163 |
+
else:
|
| 1164 |
+
print("no answer or tool call")
|
| 1165 |
+
break
|
| 1166 |
+
|
| 1167 |
+
trajectory = "\n".join(
|
| 1168 |
+
f"<{m['role']}>\n{m['content']}" for m in messages
|
| 1169 |
+
if m["role"] != "system"
|
| 1170 |
+
)
|
| 1171 |
+
return trajectory, all_tool_calls
|
| 1172 |
+
|
| 1173 |
+
|
| 1174 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1175 |
+
# HF-endpoint version of βretrieve β inject β tool loopβ
|
| 1176 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1177 |
+
@retry(max=5, sleep=1, fallback=None)
|
| 1178 |
+
def run_with_prompt_injection(
|
| 1179 |
+
self,
|
| 1180 |
+
env: str,
|
| 1181 |
+
func_schemas: str,
|
| 1182 |
+
question: str,
|
| 1183 |
+
model_url: str = "http://0.0.0.0:1214",
|
| 1184 |
+
temperature: float = 0.0,
|
| 1185 |
+
max_new_tokens: int = 512,
|
| 1186 |
+
top_n: int = 5,
|
| 1187 |
+
):
|
| 1188 |
+
"""
|
| 1189 |
+
0) call pubmed_search(question, top_n) once via the sandbox
|
| 1190 |
+
1) inject those snippets into the very first user message
|
| 1191 |
+
2) continue with the normal multi-turn ReCall loop against *model_url*
|
| 1192 |
+
"""
|
| 1193 |
+
|
| 1194 |
+
# 0οΈβ£ do a single retrieval tool call
|
| 1195 |
+
retrieve_call = json.dumps({
|
| 1196 |
+
"name": "pubmed_search",
|
| 1197 |
+
"arguments": {"query": question, "top_n": top_n}
|
| 1198 |
+
})
|
| 1199 |
+
retrieval_raw = self.execute_tool_calls(env, [retrieve_call])[0]
|
| 1200 |
+
try:
|
| 1201 |
+
snippets_block = retrieval_raw.split("result:", 1)[-1].strip()
|
| 1202 |
+
except Exception:
|
| 1203 |
+
snippets_block = ""
|
| 1204 |
+
|
| 1205 |
+
# 1οΈβ£ build initial prompt with injected snippets
|
| 1206 |
+
user_msg = (
|
| 1207 |
+
f"Question: {question}\n\n"
|
| 1208 |
+
"Here are some relevant PubMed snippets:\n"
|
| 1209 |
+
f"{snippets_block}"
|
| 1210 |
+
) if snippets_block else f"Question: {question}"
|
| 1211 |
+
|
| 1212 |
+
sys_prompt = self.init_prompt(func_schemas, question)
|
| 1213 |
+
system_prompt = f"<|im_start|>system\n{sys_prompt}<|im_end|>"
|
| 1214 |
+
user_prompt = f"<|im_start|>user\n{user_msg}<|im_end|>"
|
| 1215 |
+
assistant_pref= f"<|im_start|>assistant\n<think>"
|
| 1216 |
+
curr_prompt = system_prompt + "\n" + user_prompt + "\n" + assistant_pref
|
| 1217 |
+
|
| 1218 |
+
# 2οΈβ£ normal ReCall loop hitting the HF inference endpoint
|
| 1219 |
+
for _ in range(10):
|
| 1220 |
+
resp = requests.post(
|
| 1221 |
+
f"{model_url}/generate",
|
| 1222 |
+
json={
|
| 1223 |
+
"text": curr_prompt,
|
| 1224 |
+
"sampling_params": {
|
| 1225 |
+
"temperature": temperature,
|
| 1226 |
+
"max_new_tokens": max_new_tokens,
|
| 1227 |
+
}
|
| 1228 |
+
},
|
| 1229 |
+
timeout=120,
|
| 1230 |
+
).json()
|
| 1231 |
+
if "error" in resp.keys():
|
| 1232 |
+
print("resp", resp)
|
| 1233 |
+
assistant_txt = resp["text"]
|
| 1234 |
+
curr_prompt = self.cat_assistant_response(curr_prompt, assistant_txt)
|
| 1235 |
+
|
| 1236 |
+
tool_calls = self.extract_tool_calls(assistant_txt)
|
| 1237 |
+
if len(tool_calls) != 0:
|
| 1238 |
+
results = self.execute_tool_calls(env, tool_calls)
|
| 1239 |
+
curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
|
| 1240 |
+
else:
|
| 1241 |
+
if "<answer>" in assistant_txt:
|
| 1242 |
+
break  # model produced an answer, done
|
| 1243 |
+
else:
|
| 1244 |
+
continue  # keep reasoning without tools
|
| 1245 |
+
return curr_prompt
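For orientation, a minimal driver for this entrypoint could look like the sketch below. Every URL, the sandbox `env` string, and the schema payload are placeholder assumptions, not values from this repo:

```python
# Hypothetical usage sketch for run_with_prompt_injection; the executor URL,
# sandbox env string, and schema JSON below are placeholder assumptions.
from re_call import ReCall

agent = ReCall(executor_url="http://0.0.0.0:1240")      # placeholder executor
transcript = agent.run_with_prompt_injection(
    env="from pubmed_tools import pubmed_search",       # assumed sandbox setup
    func_schemas='[{"name": "pubmed_search", "parameters": {}}]',
    question="What is the first-line treatment for gout?",
    model_url="http://0.0.0.0:1214",
    top_n=5,
)
print(transcript[-500:])
```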
|
| 1246 |
+
|
| 1247 |
+
|
| 1248 |
+
|
| 1249 |
+
@retry(max=5, sleep=1, fallback={"score": 0})
|
| 1250 |
+
def run_budget(
|
| 1251 |
+
self,
|
| 1252 |
+
env: str,
|
| 1253 |
+
func_schemas: str,
|
| 1254 |
+
question: str,
|
| 1255 |
+
model_url: str = "http://0.0.0.0:1214",
|
| 1256 |
+
temperature: float = 0.0,
|
| 1257 |
+
max_new_tokens: int = 2048,
|
| 1258 |
+
) -> str:
|
| 1259 |
+
"""
|
| 1260 |
+
Execute an agentic dialogue with external tools while *pruning* previous
|
| 1261 |
+
<tool_response> blocks to prevent context-length explosion.
|
| 1262 |
+
"""
|
| 1263 |
+
curr_prompt = self.init_prompt(func_schemas, question)
|
| 1264 |
+
|
| 1265 |
+
for _ in range(16): # hard loop-limit
|
| 1266 |
+
# ── 1. Call the model
|
| 1267 |
+
rsp = requests.post(
|
| 1268 |
+
f"{model_url}/generate",
|
| 1269 |
+
json={
|
| 1270 |
+
"text": curr_prompt,
|
| 1271 |
+
"sampling_params": {
|
| 1272 |
+
"temperature": temperature,
|
| 1273 |
+
"max_new_tokens": max_new_tokens,
|
| 1274 |
+
"stop": ["<|im_end|>", "</think>", "</think>\n" "</think>\n\n"],
|
| 1275 |
+
},
|
| 1276 |
+
|
| 1277 |
+
},
|
| 1278 |
+
timeout=120,
|
| 1279 |
+
).json()
|
| 1280 |
+
# Fail fast on server error, before touching rsp["text"]
|
| 1281 |
+
if "error" in rsp:
|
| 1282 |
+
raise RuntimeError(rsp["error"])
|
| 1283 |
+
|
| 1284 |
+
generated = rsp["text"]
|
| 1285 |
+
matched = rsp["meta_info"]["finish_reason"].get("matched")
|
| 1286 |
+
|
| 1287 |
+
# • append the stop tag back only if the server stripped it
|
| 1288 |
+
if matched and not generated.endswith(matched):
|
| 1289 |
+
generated += matched
|
| 1290 |
+
|
| 1291 |
+
assistant_text: str = generated  # use the tag-restored text, not raw rsp["text"]
|
| 1292 |
+
curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
|
| 1293 |
+
|
| 1294 |
+
# ── 2. Check for final answer ────────────────────────────────────
|
| 1295 |
+
if "<answer>" in assistant_text:
|
| 1296 |
+
break
|
| 1297 |
+
|
| 1298 |
+
# ── 3. Extract & execute tool calls ──────────────────────────────
|
| 1299 |
+
tool_calls: List[str] = self.extract_tool_calls(assistant_text)
|
| 1300 |
+
if not tool_calls: # continue reasoning without calling a tool
|
| 1301 |
+
continue
|
| 1302 |
+
|
| 1303 |
+
results: List[str] = self.execute_tool_calls(env, tool_calls)
|
| 1304 |
+
|
| 1305 |
+
|
| 1306 |
+
# ── 4. BEFORE appending new tool output, drop all old ones ───────
|
| 1307 |
+
curr_prompt = self._strip_old_tool_responses(curr_prompt)
|
| 1308 |
+
|
| 1309 |
+
# ── 5. Append *only* the fresh tool_response block ───────────────
|
| 1310 |
+
curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
|
| 1311 |
+
|
| 1312 |
+
return curr_prompt
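`run_budget` calls `self._strip_old_tool_responses(...)` on the flat prompt string, but only the message-list variant appears in this hunk. A minimal sketch of what the string-based pruner might look like, assuming the `<tool_response>` tags used throughout this file (the regex and method body are assumptions, not the repo's actual implementation):

```python
import re

# Hypothetical sketch of the string-based pruner run_budget relies on.
_TOOL_RESP_BLOCK = re.compile(r"<tool_response>.*?</tool_response>\n?", re.S)

def _strip_old_tool_responses(self, prompt: str) -> str:
    """Drop every earlier <tool_response>...</tool_response> block from the prompt."""
    return _TOOL_RESP_BLOCK.sub("", prompt)
```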
|
| 1313 |
+
|
| 1314 |
+
|
| 1315 |
+
|
| 1316 |
+
|
| 1317 |
+
def _strip_old_tool_responses_msgs(self, messages: list[dict]) -> list[dict]:
|
| 1318 |
+
"""
|
| 1319 |
+
Return a copy of `messages` with every *user* message that starts with
|
| 1320 |
+
<tool_response> removed. Keeps assistant turns untouched.
|
| 1321 |
+
"""
|
| 1322 |
+
return [
|
| 1323 |
+
m for m in messages
|
| 1324 |
+
if not (m["role"] == "user" and m["content"].lstrip().startswith("<tool_response>"))
|
| 1325 |
+
]
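A quick illustration of the pruning behaviour (messages are made up; `self` is unused, so it can be passed as `None` for a standalone check, assuming these helpers live on the `ReCall` class as the imports in run_question.py suggest):

```python
msgs = [
    {"role": "system", "content": "sys"},
    {"role": "assistant", "content": "<tool_call>{...}</tool_call>"},
    {"role": "user", "content": "<tool_response>old result</tool_response>"},
    {"role": "user", "content": "follow-up question"},
]
pruned = ReCall._strip_old_tool_responses_msgs(None, msgs)
# The tool_response user turn is dropped; everything else survives.
assert [m["content"] for m in pruned] == ["sys", "<tool_call>{...}</tool_call>", "follow-up question"]
```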
|
| 1326 |
+
# ────────── budget version ──────────
|
| 1327 |
+
@retry(max=5, sleep=1, fallback={"score": 0})
|
| 1328 |
+
def run_deepseek_budget(
|
| 1329 |
+
self,
|
| 1330 |
+
env: str,
|
| 1331 |
+
func_schemas: str,
|
| 1332 |
+
question: str,
|
| 1333 |
+
api_key: str,
|
| 1334 |
+
model_name: str,
|
| 1335 |
+
temperature: float = 0.0,
|
| 1336 |
+
top_p: float = 0.95,
|
| 1337 |
+
max_tokens: int = 32768,
|
| 1338 |
+
max_turns: int = 10,
|
| 1339 |
+
):
|
| 1340 |
+
"""
|
| 1341 |
+
Chat-based ReCall loop for DeepSeek-R1 **with context-budget pruning**.
|
| 1342 |
+
Keeps only the *latest* <tool_response> block to avoid prompt bloat.
|
| 1343 |
+
"""
|
| 1344 |
+
sys_content = self.system_prompt_budget.format(func_schemas=func_schemas)
|
| 1345 |
+
|
| 1346 |
+
messages = [
|
| 1347 |
+
{"role": "system", "content": sys_content},
|
| 1348 |
+
{"role": "user", "content": question},
|
| 1349 |
+
]
|
| 1350 |
+
|
| 1351 |
+
client = Together(api_key=api_key)
|
| 1352 |
+
|
| 1353 |
+
for turn in range(max_turns):
|
| 1354 |
+
# ── 1. model call ───────────────────────────────────────────────
|
| 1355 |
+
resp = client.chat.completions.create(
|
| 1356 |
+
model=model_name,
|
| 1357 |
+
messages=messages,
|
| 1358 |
+
temperature=temperature,
|
| 1359 |
+
top_p=top_p,
|
| 1360 |
+
max_tokens=max_tokens,
|
| 1361 |
+
stop=["</tool_call>", "<ο½endβofβsentenceο½>"],
|
| 1362 |
+
)
|
| 1363 |
+
assistant_text = resp.choices[0].message.content
|
| 1364 |
+
messages.append({"role": "assistant", "content": assistant_text})
|
| 1365 |
+
|
| 1366 |
+
print(f"**assistant** \n {assistant_text}")
|
| 1367 |
+
|
| 1368 |
+
# ── 2. finished? ────────────────────────────────────────────────
|
| 1369 |
+
if "<answer>" in assistant_text:
|
| 1370 |
+
break
|
| 1371 |
+
|
| 1372 |
+
# ── 3. parse tool calls ────────────────────────────────────────
|
| 1373 |
+
tool_calls = self.extract_tool_calls(assistant_text)
|
| 1374 |
+
print(f"**tool_calls** \n {tool_calls}")
|
| 1375 |
+
if not tool_calls:
|
| 1376 |
+
continue # keep reasoning without tools
|
| 1377 |
+
|
| 1378 |
+
# ── 4. execute tools ───────────────────────────────────────────
|
| 1379 |
+
results = self.execute_tool_calls(env, tool_calls)
|
| 1380 |
+
print(f"**tool_response** \n {results}")
|
| 1381 |
+
|
| 1382 |
+
# ── 5. prune & append fresh tool_response ──────────────────────
|
| 1383 |
+
messages = self._strip_old_tool_responses_msgs(messages)
|
| 1384 |
+
|
| 1385 |
+
tool_resp_block = "".join(
|
| 1386 |
+
f"<tool_response>{c}\n{r}\n</tool_response>\n"
|
| 1387 |
+
for c, r in zip(tool_calls, results)
|
| 1388 |
+
)
|
| 1389 |
+
messages.append({"role": "user", "content": tool_resp_block})
|
| 1390 |
+
|
| 1391 |
+
# ── 6. flatten & return trajectory (sans system for readability) ───
|
| 1392 |
+
trajectory = "\n".join(
|
| 1393 |
+
f"<{m['role']}>\n{m['content']}" for m in messages if m["role"] != "system"
|
| 1394 |
+
)
|
| 1395 |
+
return trajectory
|
| 1396 |
+
|
| 1397 |
+
|
| 1398 |
+
@retry(max=5, sleep=1, fallback=None)
|
| 1399 |
+
def run_deepseek_with_prompt_injection(
|
| 1400 |
+
self,
|
| 1401 |
+
env: str,
|
| 1402 |
+
func_schemas: str,
|
| 1403 |
+
question: str,
|
| 1404 |
+
api_key: str,
|
| 1405 |
+
model_name: str,
|
| 1406 |
+
temperature: float = 0.0,
|
| 1407 |
+
top_p: float = 0.95,
|
| 1408 |
+
max_tokens: int = 32768,
|
| 1409 |
+
):
|
| 1410 |
+
"""
|
| 1411 |
+
1) Call pubmed_search(question, top_n=5) as a tool to get snippets.
|
| 1412 |
+
2) Inject them into the first user message.
|
| 1413 |
+
3) Proceed with the usual DeepSeek-R1 tool-based rollout.
|
| 1414 |
+
"""
|
| 1415 |
+
|
| 1416 |
+
# ── Step 0: prepare the single-tool call for retrieval ───────────────
|
| 1417 |
+
retrieve_call = json.dumps({
|
| 1418 |
+
"name": "pubmed_search",
|
| 1419 |
+
"arguments": {
|
| 1420 |
+
"query": question,
|
| 1421 |
+
"top_n": 5
|
| 1422 |
+
}
|
| 1423 |
+
})
|
| 1424 |
+
|
| 1425 |
+
# Execute it once via your helper
|
| 1426 |
+
# note: `env` must include whatever import / client-setup
|
| 1427 |
+
# your sandbox needs to run pubmed_search(...)
|
| 1428 |
+
raw_retrieval_results = self.execute_tool_calls(env, [retrieve_call])[0]
|
| 1429 |
+
# print("AAAAA"*100)
|
| 1430 |
+
try:
|
| 1431 |
+
snippets = raw_retrieval_results[9:]  # strip the leading 'result: ' prefix
|
| 1432 |
+
# print(snippets)
|
| 1433 |
+
except Exception:
|
| 1434 |
+
snippets = ""
|
| 1435 |
+
# print(f"[ReCall] Retriever call failed to parse JSON, got:\n{raw_retrieval_results!r}")
|
| 1436 |
+
|
| 1437 |
+
# ── Step 1: build the injected user prompt ──────────────────────────────
|
| 1438 |
+
if snippets:
|
| 1439 |
+
|
| 1440 |
+
user_content = (
|
| 1441 |
+
f"Question: {question}\n\n"
|
| 1442 |
+
"Here are some relevant PubMed snippets:\n"
|
| 1443 |
+
f"{snippets}"
|
| 1444 |
+
)
|
| 1445 |
+
else:
|
| 1446 |
+
user_content = f"Question: {question}"
|
| 1447 |
+
|
| 1448 |
+
# ── Step 2: start the chat history ──────────────────────────────────────
|
| 1449 |
+
sys_content = self.system_prompt_forcing_tool_call
|
| 1450 |
+
messages = [
|
| 1451 |
+
{"role": "system", "content": sys_content},
|
| 1452 |
+
{"role": "user", "content": user_content},
|
| 1453 |
+
]
|
| 1454 |
+
client = Together(api_key=api_key)
|
| 1455 |
+
|
| 1456 |
+
# ── Step 3: your normal ReCall tool-calling loop ─────────────────────
|
| 1457 |
+
for turn in range(10):
|
| 1458 |
+
resp = client.chat.completions.create(
|
| 1459 |
+
model = model_name,
|
| 1460 |
+
messages = messages,
|
| 1461 |
+
temperature = temperature,
|
| 1462 |
+
top_p = top_p,
|
| 1463 |
+
max_tokens = max_tokens,
|
| 1464 |
+
stop = ["</tool_call>", "<ο½endβofβsentenceο½>"]
|
| 1465 |
+
)
|
| 1466 |
+
|
| 1467 |
+
assistant_text = resp.choices[0].message.content
|
| 1468 |
+
messages.append({"role": "assistant", "content": assistant_text})
|
| 1469 |
+
|
| 1470 |
+
tool_calls = self.extract_tool_calls(assistant_text)
|
| 1471 |
+
if not tool_calls:
|
| 1472 |
+
break
|
| 1473 |
+
|
| 1474 |
+
# Execute all of the tool calls in one go
|
| 1475 |
+
results = self.execute_tool_calls(env, tool_calls)
|
| 1476 |
+
# and append them back in the required <tool_response> format
|
| 1477 |
+
tool_resp_block = "".join(
|
| 1478 |
+
f"<tool_response>{call}\n{out}\n</tool_response>\n"
|
| 1479 |
+
for call, out in zip(tool_calls, results)
|
| 1480 |
+
)
|
| 1481 |
+
messages.append({"role": "user", "content": tool_resp_block})
|
| 1482 |
+
|
| 1483 |
+
# ── Step 4: flatten to a single trajectory ──────────────────────────────
|
| 1484 |
+
trajectory = "\n".join(
|
| 1485 |
+
f"<{m['role']}>\n{m['content']}"
|
| 1486 |
+
for m in messages
|
| 1487 |
+
if m["role"] != "system"
|
| 1488 |
+
)
|
| 1489 |
+
return trajectory
|
| 1490 |
+
|
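One helper that every loop above depends on but that does not appear in this hunk is `extract_tool_calls`. Judging from the `</tool_call>` stop strings and the `<tool_call>` blocks echoed into tool responses, a plausible sketch is a simple tag scan; this is an assumption, not the repo's actual implementation:

```python
import re
from typing import List

_TOOL_CALL_RE = re.compile(r"<tool_call>(.*?)</tool_call>", re.S)

def extract_tool_calls(assistant_text: str) -> List[str]:
    """Return the raw payload of every <tool_call>...</tool_call> block
    (hypothetical sketch; the real method may also repair unclosed tags)."""
    return [m.strip() for m in _TOOL_CALL_RE.findall(assistant_text)]
```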
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
transformers
|
| 2 |
+
accelerate
|
| 3 |
+
openai
|
| 4 |
+
tiktoken
|
| 5 |
+
pix2tex
|
| 6 |
+
Pillow
|
| 7 |
+
gradio
|
| 8 |
+
fastapi
|
| 9 |
+
pydantic
|
| 10 |
+
uvicorn
|
| 11 |
+
together
|
| 12 |
+
beautifulsoup4
|
| 13 |
+
trafilatura
|
| 14 |
+
wikipedia
|
| 15 |
+
PyMuPDF
|
| 16 |
+
Crawl4AI
|
run_question.py
ADDED
|
@@ -0,0 +1,275 @@
|
| 1 |
+
# single_question_recall.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any, Dict, Optional
|
| 6 |
+
|
| 7 |
+
from re_call import ReCall
|
| 8 |
+
from transformers import AutoTokenizer
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
from typing import Optional, Any, Dict, Tuple, List
|
| 13 |
+
|
| 14 |
+
def _extract_answer_boxed(s: str) -> Optional[str]:
|
| 15 |
+
"""
|
| 16 |
+
Return the content of the *last* \\boxed{...} or \\fbox{...} in `s`,
|
| 17 |
+
with proper matching of nested braces. Escaped braces (\\{, \\}) are ignored
|
| 18 |
+
for counting. If no balanced block is found, returns None.
|
| 19 |
+
"""
|
| 20 |
+
def _iter_box_like_spans(text: str):
|
| 21 |
+
# Find openings for \boxed{ and \fbox{
|
| 22 |
+
openings: List[Tuple[str, int, int]] = []
|
| 23 |
+
for m in re.finditer(r'\\boxed\s*\{', text):
|
| 24 |
+
openings.append(("boxed", m.start(), m.end()))
|
| 25 |
+
for m in re.finditer(r'\\fbox\s*\{', text):
|
| 26 |
+
openings.append(("fbox", m.start(), m.end()))
|
| 27 |
+
openings.sort(key=lambda x: x[1])
|
| 28 |
+
# For each opening, scan forward to find its matching closing brace
|
| 29 |
+
for kind, start, open_end in openings:
|
| 30 |
+
depth = 1
|
| 31 |
+
i = open_end
|
| 32 |
+
n = len(text)
|
| 33 |
+
while i < n:
|
| 34 |
+
ch = text[i]
|
| 35 |
+
# Skip escaped characters: backslash escapes the next char (including { or })
|
| 36 |
+
if ch == '\\' and i + 1 < n:
|
| 37 |
+
i += 2
|
| 38 |
+
continue
|
| 39 |
+
if ch == '{':
|
| 40 |
+
depth += 1
|
| 41 |
+
elif ch == '}':
|
| 42 |
+
depth -= 1
|
| 43 |
+
if depth == 0:
|
| 44 |
+
# content is text[open_end:i]
|
| 45 |
+
yield (kind, start, open_end, i)
|
| 46 |
+
break
|
| 47 |
+
i += 1
|
| 48 |
+
|
| 49 |
+
last_content: Optional[str] = None
|
| 50 |
+
for _, _start, open_end, close_idx in _iter_box_like_spans(s):
|
| 51 |
+
last_content = s[open_end:close_idx] # keep the *last* one
|
| 52 |
+
|
| 53 |
+
return last_content.strip() if last_content is not None else None
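The manual brace scan pays off on nested and escaped braces, which a plain regex would mishandle; for example:

```python
text = r"Final: \boxed{\frac{1}{2} + \{x\}} done."
assert _extract_answer_boxed(text) == r"\frac{1}{2} + \{x\}"
```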
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _extract_answer_tagged(s: str) -> Optional[str]:
|
| 57 |
+
answer_tag_re = re.compile(r"<answer>(.*?)</answer>", re.S)
|
| 58 |
+
m = answer_tag_re.findall(s)
|
| 59 |
+
return m[-1].strip() if m else None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _parse_answer_from_transcript(transcript: str) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Prefer balanced \\boxed{...}/\\fbox{...} content, then <answer>...</answer>,
|
| 65 |
+
else fall back to the last 200 chars.
|
| 66 |
+
"""
|
| 67 |
+
return (
|
| 68 |
+
_extract_answer_boxed(transcript)
|
| 69 |
+
or _extract_answer_tagged(transcript)
|
| 70 |
+
# or transcript[-200:].strip()
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# --- main API: recall only ---
|
| 75 |
+
def answer_question_recall(
|
| 76 |
+
question: str,
|
| 77 |
+
*,
|
| 78 |
+
model_url: Optional[str] = None, # your thinker endpoint (if recall uses one)
|
| 79 |
+
executor_url: Optional[str] = None,
|
| 80 |
+
tokenizer_dir: str = "./tokenizer-info",
|
| 81 |
+
temperature: float = 0.6,
|
| 82 |
+
max_new_tokens: int = 40960,
|
| 83 |
+
top_p: float = 0.95,
|
| 84 |
+
search_env: str = "from search_api import search_urls, open_url, search_and_parse_query, query_url",
|
| 85 |
+
func_schemas = [
|
| 86 |
+
{
|
| 87 |
+
"name": "search_urls",
|
| 88 |
+
"description": "Google search and return links to web-pages with a brief snippet given a text query",
|
| 89 |
+
"parameters": {
|
| 90 |
+
"type": "object",
|
| 91 |
+
"properties": {"query": {"type": "string"}, "top_k": {"type": "integer", "default": 10}},
|
| 92 |
+
"required": ["query"],
|
| 93 |
+
},
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"name": "query_url",
|
| 97 |
+
"description": "Visit webpage and return evidence based retrival for the provided goal",
|
| 98 |
+
"parameters": {
|
| 99 |
+
"type": "object",
|
| 100 |
+
"properties": {
|
| 101 |
+
"url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
|
| 102 |
+
"goal": {"type": "string", "description": "The specific information goal for visiting webpage"},
|
| 103 |
+
},
|
| 104 |
+
"required": ["url", "goal"],
|
| 105 |
+
},
|
| 106 |
+
},
|
| 107 |
+
],
|
| 108 |
+
deepseek_name: str = "deepseek-ai/DeepSeek-R1",
|
| 109 |
+
old_prompt: Optional[str] = None,
|
| 110 |
+
deepresearch_on: bool = True,
|
| 111 |
+
summary_llm: str = "gpt-4.1-mini",
|
| 112 |
+
):
|
| 113 |
+
# ) -> Dict[str, Any]:
|
| 114 |
+
"""
|
| 115 |
+
Runs a single question through ReCall and returns:
|
| 116 |
+
{
|
| 117 |
+
"answer": str,
|
| 118 |
+
"transcript": str,
|
| 119 |
+
"tool_calls": Any,
|
| 120 |
+
"chat": Any | None
|
| 121 |
+
}
|
| 122 |
+
"""
|
| 123 |
+
if executor_url is None:
|
| 124 |
+
executor_url = os.environ["HOST_SERPER_URL"]
|
| 125 |
+
|
| 126 |
+
if model_url is None:
|
| 127 |
+
model_url = os.environ["HF_MODEL_URL"]
|
| 128 |
+
|
| 129 |
+
# 1) tokenizer (REQUIRED by ReCall.run)
|
| 130 |
+
tok = AutoTokenizer.from_pretrained(tokenizer_dir, trust_remote_code=True)
|
| 131 |
+
|
| 132 |
+
# 2) build agent
|
| 133 |
+
agent = ReCall(executor_url=executor_url)
|
| 134 |
+
|
| 135 |
+
last_out = ""
|
| 136 |
+
|
| 137 |
+
# 3) call the correct entrypoint
|
| 138 |
+
if model_url == deepseek_name:
|
| 139 |
+
# some setups use a special deepseek path that returns (transcript, tool_calls)
|
| 140 |
+
out = agent.run_deepseek(
|
| 141 |
+
env=search_env,
|
| 142 |
+
func_schemas=func_schemas,
|
| 143 |
+
question=question,
|
| 144 |
+
model_name=deepseek_name,
|
| 145 |
+
temperature=temperature,
|
| 146 |
+
max_tokens=max_new_tokens,
|
| 147 |
+
top_p=top_p,
|
| 148 |
+
)
|
| 149 |
+
transcript, tool_calls, chat = _normalize_out(out, expect_chat=False)
|
| 150 |
+
last_out = transcript
|
| 151 |
+
else:
|
| 152 |
+
# standard ReCall.run MUST receive tokenizer
|
| 153 |
+
agent_generator = agent.run(
|
| 154 |
+
env=search_env,
|
| 155 |
+
func_schemas=func_schemas,
|
| 156 |
+
question=question,
|
| 157 |
+
model_url=model_url,
|
| 158 |
+
temperature=temperature,
|
| 159 |
+
max_new_tokens=max_new_tokens,
|
| 160 |
+
tokenizer=tok,  # <- required; omitting it causes the "missing tokenizer" error
|
| 161 |
+
top_p=top_p,
|
| 162 |
+
old_prompt=old_prompt,  # <- pass the raw old prompt string here if an older chat history exists
|
| 163 |
+
# the function will append the question to the raw old prompt string (chat history) if it is not None
|
| 164 |
+
deepresearch_on=deepresearch_on,
|
| 165 |
+
summary_llm=summary_llm
|
| 166 |
+
# deepresearch=deepresearch, # <- use the deepresearch prompt
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
while True:
|
| 170 |
+
try:
|
| 171 |
+
tag, out = next(agent_generator)
|
| 172 |
+
if tag == "assistant_resp":
|
| 173 |
+
last_out = out[0]
|
| 174 |
+
yield tag, out
|
| 175 |
+
if tag == "end":
|
| 176 |
+
break
|
| 177 |
+
except StopIteration as e:
|
| 178 |
+
# the chat_str variable contains the whole conversation in the raw string form
|
| 179 |
+
# it contains the raw tokens like "<|im_start|>system\n", "<|im_end|>"
|
| 180 |
+
# and "<|im_start|>assistant\n<think>", "<tool_response>", "\n</tool_response>\n", etc.
|
| 181 |
+
chat_str: str = e.value[1][0]
|
| 182 |
+
yield "end", (chat_str,)
|
| 183 |
+
break
|
| 184 |
+
|
| 185 |
+
# 4) parse final answer
|
| 186 |
+
answer = _parse_answer_from_transcript(last_out)
|
| 187 |
+
|
| 188 |
+
return "answer", (answer,)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def _normalize_out(out: Any, expect_chat: bool) -> tuple[str, Any, Any]:
|
| 192 |
+
"""
|
| 193 |
+
Normalize ReCall outputs to (transcript, tool_calls, chat)
|
| 194 |
+
Handles:
|
| 195 |
+
- (transcript, tool_calls, chat)
|
| 196 |
+
- (transcript, tool_calls)
|
| 197 |
+
- "transcript"
|
| 198 |
+
- {"transcript": ..., "tool_calls": ..., "chat": ...} variants
|
| 199 |
+
"""
|
| 200 |
+
transcript, tool_calls, chat = "", None, None
|
| 201 |
+
|
| 202 |
+
if isinstance(out, tuple):
|
| 203 |
+
if len(out) == 3:
|
| 204 |
+
transcript, tool_calls, chat = out
|
| 205 |
+
elif len(out) == 2:
|
| 206 |
+
transcript, tool_calls = out
|
| 207 |
+
elif len(out) == 1:
|
| 208 |
+
transcript = out[0]
|
| 209 |
+
else:
|
| 210 |
+
transcript = str(out[-1])
|
| 211 |
+
elif isinstance(out, dict):
|
| 212 |
+
transcript = out.get("transcript") or out.get("output") or out.get("response") or ""
|
| 213 |
+
tool_calls = out.get("tool_calls")
|
| 214 |
+
chat = out.get("chat")
|
| 215 |
+
else:
|
| 216 |
+
transcript = str(out)
|
| 217 |
+
|
| 218 |
+
# Some implementations return None/empty; keep things predictable
|
| 219 |
+
if chat is None and expect_chat is False:
|
| 220 |
+
chat = None
|
| 221 |
+
return transcript, tool_calls, chat
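For example, with made-up outputs:

```python
assert _normalize_out(("transcript", ["call"]), expect_chat=False) == ("transcript", ["call"], None)
assert _normalize_out("bare string", expect_chat=False) == ("bare string", None, None)
assert _normalize_out({"transcript": "t", "tool_calls": []}, expect_chat=True) == ("t", [], None)
```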
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# quick demo
|
| 225 |
+
if __name__ == "__main__":
|
| 226 |
+
old_prompt = None
|
| 227 |
+
|
| 228 |
+
answer_generator = answer_question_recall(
|
| 229 |
+
"What is the most popular restraunt in kolkata?",
|
| 230 |
+
old_prompt=old_prompt
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
# print("ANSWER:", res["answer"])
|
| 234 |
+
# print("\n")
|
| 235 |
+
# # print(type(res["tool_calls"]), len(res["tool_calls"]))
|
| 236 |
+
# for i in res["tool_calls"]:
|
| 237 |
+
# print(f"{i}\n")
|
| 238 |
+
# print("\n")
|
| 239 |
+
# if res["chat"] is not None:
|
| 240 |
+
# # print(type(res["chat"]), len(res["chat"]))
|
| 241 |
+
# for i in res["chat"]:
|
| 242 |
+
# print(f"{i}\n")
|
| 243 |
+
# print("\n")
|
| 244 |
+
# print("TRANSCRIPT (tail):\n", res["transcript"][-300:])
|
| 245 |
+
|
| 246 |
+
final_chat_str = ""
|
| 247 |
+
|
| 248 |
+
while True:
|
| 249 |
+
try:
|
| 250 |
+
tag, out = next(answer_generator)
|
| 251 |
+
if tag == "assistant_resp":
|
| 252 |
+
assistant_text, tool_calls = out
|
| 253 |
+
print(f"ASSISTANT RESPONSE:\n{assistant_text}\n\n")
|
| 254 |
+
print("TOOL CALLS:\n")
|
| 255 |
+
for tool_call in tool_calls:
|
| 256 |
+
print(f"{tool_call}")
|
| 257 |
+
print("\n")
|
| 258 |
+
elif tag == "tool_results":
|
| 259 |
+
results = out[0]
|
| 260 |
+
print("TOOL RESULTS:\n")
|
| 261 |
+
for result in results:
|
| 262 |
+
print(f"{result}")
|
| 263 |
+
print("\n")
|
| 264 |
+
elif tag == "end":
|
| 265 |
+
print(f"{'='*20}\nASSISTANT RESPONSE ENDED\n{'='*20}\n\n")
|
| 266 |
+
final_chat_str = out[0]
|
| 267 |
+
elif tag == "answer":
|
| 268 |
+
answer = out[0]
|
| 269 |
+
print(f"FINAL ANSWER:\n{answer}\n\n")
|
| 270 |
+
break
|
| 271 |
+
except StopIteration as e:
|
| 272 |
+
print(f"FINAL ANSWER:\n{e.value[1][0]}\n\n")
|
| 273 |
+
break
|
| 274 |
+
|
| 275 |
+
print(f"{'='*20}\nEND\n{'='*20}\n\n\nFINAL CHAT STRING:\n{final_chat_str}\n\n")
|
tokenizer-info/added_tokens.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0284b582e14987fbd3d5a2cb2bd139084371ed9acbae488829a1c900833c680
|
| 3 |
+
size 707
|
tokenizer-info/merges.txt
ADDED
|
The diff for this file is too large to render.
|
tokenizer-info/special_tokens_map.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76862e765266b85aa9459767e33cbaf13970f327a0e88d1c65846c2ddd3a1ecd
|
| 3 |
+
size 613
|
tokenizer-info/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
|
| 3 |
+
size 11422654
|
tokenizer-info/tokenizer_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:443bfa629eb16387a12edbf92a76f6a6f10b2af3b53d87ba1550adfcf45f7fa0
|
| 3 |
+
size 5404
|
tokenizer-info/vocab.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
|
| 3 |
+
size 2776833
|
web_agents_5/compressor.py
ADDED
|
@@ -0,0 +1,314 @@
|
| 1 |
+
# compressor.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import functools, json, logging, re
|
| 4 |
+
from difflib import SequenceMatcher
|
| 5 |
+
from io import StringIO
|
| 6 |
+
from typing import Dict, List, Tuple
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import regex # needed by tiktoken
|
| 10 |
+
import tiktoken
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
+
from config import CFG
|
| 13 |
+
from web_helpers import retry
|
| 14 |
+
|
| 15 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 16 |
+
# 0. shared helpers
|
| 17 |
+
# ------------------------------------------------------------------------
|
| 18 |
+
enc = tiktoken.get_encoding("cl100k_base")
|
| 19 |
+
_tok = lambda s: len(enc.encode(s)) # fast inline counter
|
| 20 |
+
|
| 21 |
+
@functools.lru_cache(maxsize=1)
|
| 22 |
+
def _nlp():
|
| 23 |
+
import spacy
|
| 24 |
+
return spacy.load("en_core_web_sm")
|
| 25 |
+
|
| 26 |
+
def _openai_client():
|
| 27 |
+
"""Import OpenAI lazily to avoid overhead when not needed."""
|
| 28 |
+
import importlib
|
| 29 |
+
mod = importlib.import_module("openai")
|
| 30 |
+
return getattr(mod, "OpenAI", None)() if hasattr(mod, "OpenAI") else mod
|
| 31 |
+
|
| 32 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 33 |
+
# 1. regex patterns (compiled once)
|
| 34 |
+
# ------------------------------------------------------------------------
|
| 35 |
+
DATE_PATS = [re.compile(p, re.I) for p in [
|
| 36 |
+
r"\d{4}-\d{2}-\d{2}",
|
| 37 |
+
r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}",
|
| 38 |
+
r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}",
|
| 39 |
+
r"\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}",
|
| 40 |
+
r"\b\d{4}/\d{2}\b",
|
| 41 |
+
r"\b\d{4}\b(?!\s*(?:%|million|billion|thousand))",
|
| 42 |
+
]]
|
| 43 |
+
EMAIL_PAT = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
|
| 44 |
+
URL_PAT = re.compile(r"https?://[^\s\)]+")
|
| 45 |
+
PHONE_PAT = re.compile(r"\+?\d[\d\s\-().]{7,}\d")
|
| 46 |
+
CURR_PAT = re.compile(r"(\$\s?\d+(?:,\d{3})*(?:\.\d+)?|\d+(?:,\d{3})*(?:\.\d+)?\s*(USD|EUR|GBP|INR|Β₯|β©|βΉ|β¬))", re.I)
|
| 47 |
+
DEF_PAT = re.compile(r"([A-Z][A-Za-z0-9\s]+?)\s+(is|are|refers to|means)\s+(.*?)(?:[\.\n])")
|
| 48 |
+
|
| 49 |
+
MD_TABLE_PAT = re.compile(
|
| 50 |
+
r"(?:^\|.*?\|\n?)+(?:^\|[-:\s|]+\|\n?)?(?:^\|.*?\|\n?)+", re.M)
|
| 51 |
+
CSV_PAT = re.compile(r"((?:^.*?,.*?\n){2,})", re.M)
|
| 52 |
+
TSV_PAT = re.compile(r"((?:^.*?\t.*?\n){2,})", re.M)
|
| 53 |
+
|
| 54 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 55 |
+
# 2. core utilities
|
| 56 |
+
# ------------------------------------------------------------------------
|
| 57 |
+
def deduplicate_items(items: List[str], *, similarity=0.5,
|
| 58 |
+
other: List[str] | None = None) -> List[str]:
|
| 59 |
+
"""Drop nearβduplicates; prefer the longest variant."""
|
| 60 |
+
if not items: return []
|
| 61 |
+
other = other or []
|
| 62 |
+
|
| 63 |
+
def _clean(x: str) -> str:
|
| 64 |
+
x = re.sub(r'\[edit\]|\[\d+\]', '', x)
|
| 65 |
+
return re.sub(r'\s+', ' ', x).strip()
|
| 66 |
+
|
| 67 |
+
out, out_clean = [], []
|
| 68 |
+
for orig in items:
|
| 69 |
+
clean = _clean(orig)
|
| 70 |
+
dup = False
|
| 71 |
+
for ref in out_clean + list(map(_clean, other)):
|
| 72 |
+
sim = SequenceMatcher(None, clean, ref).ratio()
|
| 73 |
+
if sim >= similarity or clean in ref or ref in clean:
|
| 74 |
+
dup = True
|
| 75 |
+
# if current is longer than stored, replace
|
| 76 |
+
if clean not in out_clean and len(clean) > len(ref):
|
| 77 |
+
idx = out_clean.index(ref)
|
| 78 |
+
out[idx], out_clean[idx] = orig, clean
|
| 79 |
+
break
|
| 80 |
+
if not dup:
|
| 81 |
+
out.append(orig)
|
| 82 |
+
out_clean.append(clean)
|
| 83 |
+
return out
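For example, two near-identical snippets collapse to the longer variant while unrelated items survive:

```python
items = [
    "Paris is the capital of France",
    "Paris is the capital of France. [1]",
    "Tokyo is the capital of Japan",
]
print(deduplicate_items(items, similarity=0.9))
# -> ['Paris is the capital of France. [1]', 'Tokyo is the capital of Japan']
```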
|
| 84 |
+
|
| 85 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 86 |
+
# 3. fact & table extractor
|
| 87 |
+
# ------------------------------------------------------------------------
|
| 88 |
+
def extract_facts_and_tables(text: str) -> Tuple[str, List[str], List[str]]:
|
| 89 |
+
facts, spans = [], []
|
| 90 |
+
|
| 91 |
+
def _add(match):
|
| 92 |
+
facts.append(match.group())
|
| 93 |
+
spans.append(match.span())
|
| 94 |
+
|
| 95 |
+
for pat in DATE_PATS: [_add(m) for m in pat.finditer(text)]
|
| 96 |
+
for m in EMAIL_PAT.finditer(text): _add(m)
|
| 97 |
+
for m in URL_PAT.finditer(text): _add(m)
|
| 98 |
+
for m in PHONE_PAT.finditer(text): _add(m)
|
| 99 |
+
for m in CURR_PAT.finditer(text): _add(m)
|
| 100 |
+
for m in DEF_PAT.finditer(text): _add(m)
|
| 101 |
+
|
| 102 |
+
# contextual sentences around facts
|
| 103 |
+
doc = _nlp()(text)
|
| 104 |
+
ctx = [s.text.strip() for s in doc.sents
|
| 105 |
+
if any(s.start_char <= s_ <= s.end_char for s_, _ in spans)]
|
| 106 |
+
facts.extend(ctx)
|
| 107 |
+
facts = sorted(set(facts))
|
| 108 |
+
|
| 109 |
+
# ── tables
|
| 110 |
+
tables = []
|
| 111 |
+
|
| 112 |
+
for tbl in MD_TABLE_PAT.findall(text):
|
| 113 |
+
cleaned = "\n".join(l for l in tbl.splitlines()
|
| 114 |
+
if l.strip() and not re.match(r"^\|[-:\s|]+\|$", l))
|
| 115 |
+
if len(cleaned.splitlines()) < 2: continue
|
| 116 |
+
try:
|
| 117 |
+
df = pd.read_csv(StringIO(cleaned), sep="|").dropna(how="all", axis=1)
|
| 118 |
+
tables.append(df.to_markdown(index=False))
|
| 119 |
+
except Exception:
|
| 120 |
+
tables.append(cleaned)
|
| 121 |
+
|
| 122 |
+
soup = BeautifulSoup(text, "lxml")
|
| 123 |
+
for html_tbl in soup.find_all("table"):
|
| 124 |
+
try:
|
| 125 |
+
df = pd.read_html(str(html_tbl))[0]
|
| 126 |
+
tables.append(df.to_markdown(index=False))
|
| 127 |
+
except Exception:
|
| 128 |
+
tables.append(str(html_tbl))
|
| 129 |
+
|
| 130 |
+
for m in CSV_PAT.finditer(text):
|
| 131 |
+
try:
|
| 132 |
+
df = pd.read_csv(StringIO(m.group(1)))
|
| 133 |
+
if not df.empty:
|
| 134 |
+
tables.append(df.to_markdown(index=False))
|
| 135 |
+
except Exception:
|
| 136 |
+
pass
|
| 137 |
+
for m in TSV_PAT.finditer(text):
|
| 138 |
+
try:
|
| 139 |
+
df = pd.read_csv(StringIO(m.group(1)), sep="\t")
|
| 140 |
+
if not df.empty:
|
| 141 |
+
tables.append(df.to_markdown(index=False))
|
| 142 |
+
except Exception:
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
# ── clean narrative (remove facts & tables)
|
| 146 |
+
narrative = text
|
| 147 |
+
for tbl in tables: narrative = narrative.replace(tbl, " ")
|
| 148 |
+
for s, e in sorted(spans, reverse=True): narrative = narrative[:s] + narrative[e:]
|
| 149 |
+
narrative = re.sub(r"\s{2,}", " ", narrative).strip()
|
| 150 |
+
|
| 151 |
+
return narrative, facts, tables
|
| 152 |
+
|
| 153 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 154 |
+
# 4. OpenAI summariser helpers
|
| 155 |
+
# ------------------------------------------------------------------------
|
| 156 |
+
def _summarise(text: str, pct: float, model: str) -> str:
|
| 157 |
+
target_tokens = int(_tok(text) * pct)
|
| 158 |
+
sys_prompt = (
|
| 159 |
+
"You are an expert abstractor. Summarize the text below to "
|
| 160 |
+
f"approximately {pct*100:.0f}% of its original length (β{target_tokens} tokens), "
|
| 161 |
+
"while **retaining all key facts, figures, names, dates, places, and events**. "
|
| 162 |
+
"Ensure the summary remains accurate, informative, and faithful to the original content."
|
| 163 |
+
)
|
| 164 |
+
client = _openai_client()
|
| 165 |
+
rsp = client.chat.completions.create(
|
| 166 |
+
model=model, temperature=0.2,
|
| 167 |
+
messages=[{"role":"system","content":sys_prompt},
|
| 168 |
+
{"role":"user","content":text}],
|
| 169 |
+
max_tokens=CFG.output_limit_per_link
|
| 170 |
+
)
|
| 171 |
+
return rsp.choices[0].message.content
|
| 172 |
+
|
| 173 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 174 |
+
# 5. compress_text (public)
|
| 175 |
+
# ------------------------------------------------------------------------
|
| 176 |
+
def compress_text(text: str, *, pct: float = 0.3,
|
| 177 |
+
model: str = "gpt-4o-mini") -> str:
|
| 178 |
+
|
| 179 |
+
FACTS_TABLES_LIMIT = CFG.output_limit_per_link - CFG.disable_narrative_compress_thresh
|
| 180 |
+
narrative, facts, tables = extract_facts_and_tables(text)
|
| 181 |
+
|
| 182 |
+
# narrative compression
|
| 183 |
+
if _tok(narrative) > CFG.disable_narrative_compress_thresh:
|
| 184 |
+
narrative_txt = _summarise(narrative, pct, model)
|
| 185 |
+
else:
|
| 186 |
+
narrative_txt = narrative
|
| 187 |
+
return narrative_txt
|
| 188 |
+
|
| 189 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 190 |
+
# 6. query_text (goal-oriented extraction)
|
| 191 |
+
# ------------------------------------------------------------------------
|
| 192 |
+
EXTRACTOR_SYS_PROMPT = (
|
| 193 |
+
"You are a highly skilled information extraction agent. Your job is to analyze long, complex webpages "
|
| 194 |
+
"in the context of a specific user goal. You excel at identifying relevant sections, capturing supporting evidence "
|
| 195 |
+
"in full original context, and providing logically structured summaries. Always ensure precision, completeness, "
|
| 196 |
+
"and alignment with the userβs intent."
|
| 197 |
+
)
|
| 198 |
+
EXTRACTOR_PROMPT_TEMPLATE = """You are a highly skilled information extraction agent. Your task is to analyze the following webpage content in light of a specific user goal, and extract accurate, well-structured information using plain text format.
|
| 199 |
+
|
| 200 |
+
## Webpage Content
|
| 201 |
+
{webpage_content}
|
| 202 |
+
|
| 203 |
+
## User Goal
|
| 204 |
+
{goal}
|
| 205 |
+
|
| 206 |
+
## Task Guidelines
|
| 207 |
+
1. **Rational**: Briefly explain why this content is relevant to the user's goal.
|
| 208 |
+
2. **Evidence**: Quote the most relevant parts of the webpage that directly support or address the goal. Use bullet points or numbered lines separated by newlines.
|
| 209 |
+
3. **Summary**: Provide a clear, logically structured summary of the extracted evidence that addresses the user's goal.
|
| 210 |
+
|
| 211 |
+
## Output Format
|
| 212 |
+
Your response must follow **exactly this format** with the three sections:
|
| 213 |
+
Rational: <one paragraph>
|
| 214 |
+
Evidence: <first point>\n<second point>...
|
| 215 |
+
Summary: <concise paragraph summarizing the evidence>
|
| 216 |
+
"""
|
| 217 |
+
|
| 218 |
+
def extract_regex(text: str) -> Dict[str, str]:
|
| 219 |
+
def extract_section(header: str) -> str:
|
| 220 |
+
# Match the section starting with `Header:` until the next capitalized line followed by `:` or end
|
| 221 |
+
pattern = rf"{header}:\s*(.*?)(?=\n[A-Z][a-z]+:|\Z)"
|
| 222 |
+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
|
| 223 |
+
return match.group(1).strip() if match else "(not found)"
|
| 224 |
+
|
| 225 |
+
return {
|
| 226 |
+
"rational": extract_section("Rational"),
|
| 227 |
+
"evidence": extract_section("Evidence"),
|
| 228 |
+
"summary": extract_section("Summary")
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
def query_text(
|
| 232 |
+
url: str,
|
| 233 |
+
text: str,
|
| 234 |
+
goal: str,
|
| 235 |
+
*,
|
| 236 |
+
model: str = "gpt-4.1-mini",
|
| 237 |
+
max_attempts: int = 3,
|
| 238 |
+
) -> Dict[str, str]:
|
| 239 |
+
"""Goalβoriented extractor with retries β compress fallback β token trim fallback."""
|
| 240 |
+
prompt = EXTRACTOR_PROMPT_TEMPLATE.format(
|
| 241 |
+
webpage_content=text[:15_000], # clip for safety
|
| 242 |
+
goal=goal,
|
| 243 |
+
)
|
| 244 |
+
client = _openai_client()
|
| 245 |
+
|
| 246 |
+
for attempt in range(1, max_attempts + 1):
|
| 247 |
+
try:
|
| 248 |
+
rsp = client.chat.completions.create(
|
| 249 |
+
model=model,
|
| 250 |
+
temperature=0.0,
|
| 251 |
+
messages=[
|
| 252 |
+
{"role": "system", "content": EXTRACTOR_SYS_PROMPT},
|
| 253 |
+
{"role": "user", "content": prompt},
|
| 254 |
+
],
|
| 255 |
+
max_tokens = 1024
|
| 256 |
+
).choices[0].message.content
|
| 257 |
+
|
| 258 |
+
extracted = extract_regex(rsp)
|
| 259 |
+
|
| 260 |
+
# Sanity check: evidence + summary must be > 20 characters
|
| 261 |
+
if len(extracted.get("evidence", "")) + len(extracted.get("summary", "")) > 20:
|
| 262 |
+
return {
|
| 263 |
+
"extracted_info": (
|
| 264 |
+
f"The useful information in {url} for goal β{goal}β:\n\n"
|
| 265 |
+
f"Rationale:\n{extracted.get('rational')}\n\n"
|
| 266 |
+
f"Evidence:\n{extracted.get('evidence')}\n\n"
|
| 267 |
+
f"Summary:\n{extracted.get('summary')}"
|
| 268 |
+
)
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
raise ValueError("LLM returned empty or malformed extraction")
|
| 272 |
+
|
| 273 |
+
except Exception as e:
|
| 274 |
+
logging.warning("Attempt %d/%d failed for query-based extraction: %s",
|
| 275 |
+
attempt, max_attempts, e)
|
| 276 |
+
|
| 277 |
+
# ── Retry fallback: compress text ─────────────────────────────────────
|
| 278 |
+
try:
|
| 279 |
+
compressed = compress_text(text, model=model)
|
| 280 |
+
return {
|
| 281 |
+
"extracted_info": (
|
| 282 |
+
f"Goal-based extraction failed after {max_attempts} attempts; "
|
| 283 |
+
f"returning compressed webpage:\n\n{compressed}"
|
| 284 |
+
)
|
| 285 |
+
}
|
| 286 |
+
except Exception as ce:
|
| 287 |
+
logging.error("compress_text also failed: %s", ce)
|
| 288 |
+
|
| 289 |
+
# ── Final fallback: hard truncate to token budget ────────────────────
|
| 290 |
+
return {
|
| 291 |
+
"extracted_info": (
|
| 292 |
+
"Goal-based extraction and compression both failed; "
|
| 293 |
+
"returning truncated webpage content:\n\n" +
|
| 294 |
+
enc.decode(enc.encode(text)[:CFG.output_limit_per_link])  # token-level truncation; trim_to_budget expects a list
|
| 295 |
+
)
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ────────────────────────────────────────────────────────────────────────
|
| 300 |
+
# 7. helper: trim long lists to token budget
|
| 301 |
+
# ------------------------------------------------------------------------
|
| 302 |
+
def trim_to_budget(items: List[str], budget: int, *,
|
| 303 |
+
is_table: bool) -> Tuple[str, int]:
|
| 304 |
+
build, used = [], 0
|
| 305 |
+
for it in items:
|
| 306 |
+
toks = _tok(it)
|
| 307 |
+
if used + toks > budget:
|
| 308 |
+
break
|
| 309 |
+
build.append(it)
|
| 310 |
+
used += toks
|
| 311 |
+
if len(build) < len(items):
|
| 312 |
+
build.append(f"[{len(items)-len(build)} {'tables' if is_table else 'facts'} omitted]")
|
| 313 |
+
joined = "\n\n".join(build) if is_table else "\n".join(build)
|
| 314 |
+
return joined, _tok(joined)
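A small usage example of the budget trimmer (numbers illustrative):

```python
facts = [f"fact {i}: " + "lorem ipsum dolor sit amet " * 10 for i in range(5)]
joined, used = trim_to_budget(facts, 150, is_table=False)
# Facts are kept greedily until the 150-token budget would be exceeded;
# a trailing "[N facts omitted]" line summarises what was dropped.
print(joined.splitlines()[-1], used)
```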
|
web_agents_5/config.py
ADDED
|
@@ -0,0 +1,38 @@
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import logging, os, random, requests
|
| 3 |
+
|
| 4 |
+
class _Cfg:
|
| 5 |
+
ua: str = (
|
| 6 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
| 7 |
+
"(KHTML, like Gecko) Chrome/125.0 Safari/537.36"
|
| 8 |
+
)
|
| 9 |
+
serper_key = os.getenv("SERPER_API_KEY", "")
|
| 10 |
+
jina_cache_dir = os.getenv("JINA_CACHE_DIR", "")
|
| 11 |
+
serper_cache_dir = os.getenv("SERPER_CACHE_DIR", "")
|
| 12 |
+
jina_key = os.getenv("JINA_API_KEY", "")
|
| 13 |
+
serper_ep = "https://google.serper.dev/search"
|
| 14 |
+
crawl4ai_ep = os.getenv("CRAWL4AI_EP", "http://localhost:8080")
|
| 15 |
+
retries = 3
|
| 16 |
+
backoff = 0.8
|
| 17 |
+
connect_to = 5
|
| 18 |
+
read_to = 10
|
| 19 |
+
stream_html_cap = 200_000
|
| 20 |
+
pdf_size_cap = 32_000_000
|
| 21 |
+
pdf_pages_cap = 40
|
| 22 |
+
pdf_chars_cap = 40_000
|
| 23 |
+
text_cap = 400_000
|
| 24 |
+
output_limit_per_link = 6_000
|
| 25 |
+
disable_narrative_compress_thresh = 2_000
|
| 26 |
+
pct = 0.25 # narrative compression pct
|
| 27 |
+
reddit_client_id = os.getenv("REDDIT_CLIENT_ID", "")  # never hardcode credentials
|
| 28 |
+
reddit_client_secret = os.getenv("REDDIT_CLIENT_SECRET", "")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
CFG = _Cfg()
|
| 32 |
+
_RND = random.Random()
|
| 33 |
+
_SESS = requests.Session()
|
| 34 |
+
_SESS.headers.update({"User-Agent": CFG.ua})
|
| 35 |
+
|
| 36 |
+
# logging.basicConfig(level=logging.INFO,
|
| 37 |
+
# format="%(asctime)s - %(levelname)s - %(message)s")
|
| 38 |
+
logging.getLogger().setLevel(logging.INFO)  # bump root logger to INFO
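Because `CFG` is a plain object, callers can adjust limits at runtime without touching this file, e.g.:

```python
from config import CFG

CFG.output_limit_per_link = 4_000   # tighten the per-link token budget
CFG.retries = 5                     # be more patient with flaky hosts
```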
|
web_agents_5/fetchers/__init__.py
ADDED
|
File without changes
|
web_agents_5/fetchers/basic_fetcher.py
ADDED
|
@@ -0,0 +1,42 @@
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import logging
|
| 3 |
+
from urllib.parse import unquote
|
| 4 |
+
from config import CFG, _SESS
|
| 5 |
+
from web_helpers import extract_main_text, fetch_blocked_site
|
| 6 |
+
|
| 7 |
+
_BINARY = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".zip", ".tar",
|
| 8 |
+
".gz", ".mp3", ".mp4", ".mkv", ".exe")
|
| 9 |
+
|
| 10 |
+
_ERROR = ["wrong", "error", "try again"]
|
| 11 |
+
|
| 12 |
+
def _looks_like_error(txt):
|
| 13 |
+
if len(txt) < 300:
|
| 14 |
+
for err in _ERROR:
|
| 15 |
+
if err in txt:
|
| 16 |
+
return True
|
| 17 |
+
return False
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def fetch_html(url: str) -> str:
|
| 21 |
+
if url.lower().endswith(_BINARY):
|
| 22 |
+
return "[binary omitted]"
|
| 23 |
+
try:
|
| 24 |
+
r = _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to))
|
| 25 |
+
r.raise_for_status()
|
| 26 |
+
ctype = (r.headers.get("content-type") or "").lower()
|
| 27 |
+
if "pdf" in ctype or not ("text" in ctype or "html" in ctype):
|
| 28 |
+
return "[binary omitted]"
|
| 29 |
+
raw = r.raw.read(CFG.stream_html_cap, decode_content=True)
|
| 30 |
+
html = raw.decode(r.encoding or "utf-8", errors="ignore")
|
| 31 |
+
txt = extract_main_text(html).strip()
|
| 32 |
+
if "wikipedia.org" in url:
|
| 33 |
+
slug = unquote(url.rsplit("/", 1)[-1]).replace("_", " ")
|
| 34 |
+
if slug.lower() not in txt.lower():
|
| 35 |
+
txt = f"{slug}\n\n{txt}"
|
| 36 |
+
if _looks_like_error(txt):
|
| 37 |
+
return f"[Error fetching url: {url}]"
|
| 38 |
+
else:
|
| 39 |
+
return "[Retrived using HTML] " + txt
|
| 40 |
+
except Exception as e:
|
| 41 |
+
logging.error("Generic fetch failed %s: %s", url, e)
|
| 42 |
+
return fetch_blocked_site(url)
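Typical use, assuming the module is importable as below:

```python
# Assumes web_agents_5 is on sys.path, as in the rest of this repo.
from fetchers.basic_fetcher import fetch_html

text = fetch_html("https://en.wikipedia.org/wiki/Kolkata")
print(text[:120])   # starts with "[Retrieved using HTML] " on success
```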
|
web_agents_5/fetchers/crawl4ai_fetcher.py
ADDED
|
@@ -0,0 +1,104 @@
|
| 1 |
+
"""
|
| 2 |
+
Asynchronous wrapper around **Crawl4AI** so that other coroutines can await a
|
| 3 |
+
single call β identical to the previous implementation but isolated in its own
|
| 4 |
+
module to satisfy clean-architecture / layering.
|
| 5 |
+
|
| 6 |
+
Public API
|
| 7 |
+
==========
|
| 8 |
+
async def fetch_crawl4ai(url: str) -> str
|
| 9 |
+
Returns markdown extracted by Crawl4AI or raises `RuntimeError` on failure.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import asyncio, logging
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any
|
| 16 |
+
|
| 17 |
+
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
| 18 |
+
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
| 19 |
+
from config import CFG
|
| 20 |
+
|
| 21 |
+
# ----------------------------------------------------------------------------
|
| 22 |
+
_MAX_CONCURRENT_PAGES = 6
|
| 23 |
+
_MAX_ATTEMPTS = 5
|
| 24 |
+
_RETRYABLE = (
|
| 25 |
+
"handler is closed",
|
| 26 |
+
"browser has disconnected",
|
| 27 |
+
"transport closed",
|
| 28 |
+
"target crashed",
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
# Globals bound to the *event-loop* currently active
|
| 32 |
+
_CRAWLER: AsyncWebCrawler | None = None
|
| 33 |
+
_CRAWLER_LOOP: asyncio.AbstractEventLoop | None = None
|
| 34 |
+
_SEMAPHORES: dict[asyncio.AbstractEventLoop, asyncio.Semaphore] = {}
|
| 35 |
+
_CFG = CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator())
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _get_semaphore() -> asyncio.Semaphore:
|
| 39 |
+
loop = asyncio.get_running_loop()
|
| 40 |
+
if loop not in _SEMAPHORES:
|
| 41 |
+
_SEMAPHORES[loop] = asyncio.Semaphore(_MAX_CONCURRENT_PAGES)
|
| 42 |
+
return _SEMAPHORES[loop]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
async def _ensure_crawler() -> None:
|
| 46 |
+
global _CRAWLER, _CRAWLER_LOOP
|
| 47 |
+
loop = asyncio.get_running_loop()
|
| 48 |
+
if _CRAWLER is None or loop is not _CRAWLER_LOOP:
|
| 49 |
+
if _CRAWLER is not None:
|
| 50 |
+
try:
|
| 51 |
+
await _CRAWLER.aclose()
|
| 52 |
+
except Exception:
|
| 53 |
+
pass
|
| 54 |
+
_CRAWLER = AsyncWebCrawler()
|
| 55 |
+
await _CRAWLER.__aenter__()
|
| 56 |
+
_CRAWLER_LOOP = loop
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass
|
| 60 |
+
class _Page:
|
| 61 |
+
success: bool
|
| 62 |
+
markdown: str | None = None
|
| 63 |
+
error: str | None = None
|
| 64 |
+
meta: dict[str, Any] = field(default_factory=dict)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def _crawl_once(url: str) -> _Page:
|
| 68 |
+
global _CRAWLER
|
| 69 |
+
await _ensure_crawler()
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
result = await _CRAWLER.arun(url, config=_CFG)
|
| 73 |
+
if result.success and result.markdown:
|
| 74 |
+
return _Page(True, result.markdown, meta=result.__dict__)
|
| 75 |
+
return _Page(False, error=result.error_message or "no markdown")
|
| 76 |
+
except Exception as exc:
|
| 77 |
+
return _Page(False, error=str(exc))
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
async def fetch_crawl4ai(url: str) -> str:
|
| 81 |
+
"""Return markdown extracted by Crawl4AI or raise on failure."""
|
| 82 |
+
sem = _get_semaphore()
|
| 83 |
+
async with sem:
|
| 84 |
+
for attempt in range(1, _MAX_ATTEMPTS + 1):
|
| 85 |
+
page = await _crawl_once(url)
|
| 86 |
+
if page.success and page.markdown:
|
| 87 |
+
print(len(page.markdown))
|
| 88 |
+
return "[Retrieved from Craw4AI]" + page.markdown[:CFG.text_cap]
|
| 89 |
+
|
| 90 |
+
err = page.error or "unknown"
|
| 91 |
+
logging.warning("Crawl4AI attempt %d/%d failed: %s", attempt, _MAX_ATTEMPTS, err)
|
| 92 |
+
|
| 93 |
+
if attempt < _MAX_ATTEMPTS and any(p in err.lower() for p in _RETRYABLE):
|
| 94 |
+
# reset shared browser & retry after back-off
|
| 95 |
+
global _CRAWLER
|
| 96 |
+
try:
|
| 97 |
+
await _CRAWLER.aclose()
|
| 98 |
+
except Exception:
|
| 99 |
+
pass
|
| 100 |
+
_CRAWLER = None
|
| 101 |
+
await asyncio.sleep(1.5 * attempt)
|
| 102 |
+
continue
|
| 103 |
+
|
| 104 |
+
raise RuntimeError(err)
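Because the fetcher is a coroutine bound to the running event loop, it must be awaited; a minimal driver (import path assumed):

```python
import asyncio
from fetchers.crawl4ai_fetcher import fetch_crawl4ai   # import path assumed

async def main() -> None:
    md = await fetch_crawl4ai("https://example.com")
    print(md[:120])   # "[Retrieved from Crawl4AI] ..." on success

asyncio.run(main())
```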
|
web_agents_5/fetchers/github_fetcher.py
ADDED
|
@@ -0,0 +1,64 @@
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from config import CFG, _SESS, _RND
|
| 3 |
+
import logging
|
| 4 |
+
import re
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
import functools
|
| 7 |
+
import random
|
| 8 |
+
import requests
|
| 9 |
+
import trafilatura
|
| 10 |
+
import time
|
| 11 |
+
import os
from web_helpers import retry, fetch_blocked_site  # ⬅️ shared
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def fetch_github(url: str) -> str:
|
| 15 |
+
def _markdown_cleanup(md: str) -> str:
|
| 16 |
+
md = re.sub(r"```.*?```", "", md, flags=re.S)
|
| 17 |
+
md = re.sub(r"^#+\s*", "", md, flags=re.M)
|
| 18 |
+
return re.sub(r"[ \t]{2,}", " ", md).strip()
|
| 19 |
+
|
| 20 |
+
hdr = {"User-Agent": "ii-research-bot/0.6"}
|
| 21 |
+
try:
|
| 22 |
+
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
|
| 23 |
+
if m:
|
| 24 |
+
owner, repo = m.groups()
|
| 25 |
+
api = f"https://api.github.com/repos/{owner}/{repo}/readme"
|
| 26 |
+
hdr_api = hdr | {"Accept": "application/vnd.github.v3.raw"}
|
| 27 |
+
if (tok := os.getenv("GITHUB_TOKEN")):
|
| 28 |
+
hdr_api["Authorization"] = f"Bearer {tok}"
|
| 29 |
+
r = _SESS.get(api, headers=hdr_api, timeout=(CFG.connect_to, CFG.read_to))
|
| 30 |
+
if r.ok and len(r.text) > 30:
|
| 31 |
+
return _markdown_cleanup(r.text)[:CFG.text_cap]
|
| 32 |
+
|
| 33 |
+
if "/blob/" in url or "/tree/" in url:
|
| 34 |
+
raw = re.sub(
|
| 35 |
+
r"https://github\.com/([^/]+)/([^/]+)/(?:blob|tree)/",
|
| 36 |
+
r"https://raw.githubusercontent.com/\\1/\\2/",
|
| 37 |
+
url,
|
| 38 |
+
count=1,
|
| 39 |
+
).split("?", 1)[0]
|
| 40 |
+
r = _SESS.get(raw, headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
|
| 41 |
+
if r.ok and "text" in (r.headers.get("content-type") or "") and len(r.text) > 0:
|
| 42 |
+
return r.text[:CFG.text_cap]
|
| 43 |
+
|
| 44 |
+
raw1 = url + ("?raw=1" if "?" not in url else "&raw=1")
|
| 45 |
+
r = _SESS.get(raw1, headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
|
| 46 |
+
if r.ok and "text" in (r.headers.get("content-type") or "") and len(r.text) > 0:
|
| 47 |
+
return r.text[:CFG.text_cap]
|
| 48 |
+
|
| 49 |
+
plain = url + ("?plain=1" if "?" not in url else "&plain=1")
|
| 50 |
+
html = _SESS.get(plain, headers=hdr, timeout=(CFG.connect_to, CFG.read_to)).text
|
| 51 |
+
soup = BeautifulSoup(html, "lxml")
|
| 52 |
+
pre = soup.find("pre")
|
| 53 |
+
if pre and len(pre.text) > 10:
|
| 54 |
+
return pre.text[:CFG.text_cap]
|
| 55 |
+
|
| 56 |
+
if "raw.githubusercontent.com" in url:
|
| 57 |
+
r = _SESS.get(url.split("?", 1)[0], headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
|
| 58 |
+
if r.ok and "text" in (r.headers.get("content-type") or ""):
|
| 59 |
+
return "[Retrieved from raw.githubusercontent.com]" + r.text[:CFG.text_cap]
|
| 60 |
+
|
| 61 |
+
raise RuntimeError("github: unable to retrieve plain text")
|
| 62 |
+
except Exception as e:
|
| 63 |
+
logging.error(f"GitHub fetch failed for {url}: {e}")
|
| 64 |
+
return fetch_blocked_site(url)
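The fall-through chain above is meant to cover several GitHub URL shapes; illustrative calls (network access assumed):

```python
fetch_github("https://github.com/psf/requests")                        # repo root -> README via API
fetch_github("https://github.com/psf/requests/blob/main/README.md")    # blob -> raw.githubusercontent.com
fetch_github("https://raw.githubusercontent.com/psf/requests/main/README.md")
```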
|
web_agents_5/fetchers/jina_fetcher.py
ADDED
|
@@ -0,0 +1,151 @@
|
| 1 |
+
# """
|
| 2 |
+
# Jina AI powered web-page fetcher.
|
| 3 |
+
|
| 4 |
+
# Provides `fetch_jina(url: str) -> str` which returns a **plain-text or markdown** body
|
| 5 |
+
# prefixed with `[Retrieved from Jina AI]` so callers can recognise the source.
|
| 6 |
+
# If the Jina endpoint cannot return usable text (HTTP error, short / empty body, etc.)
|
| 7 |
+
# this function raises an Exception β letting the orchestrator fall back to other
|
| 8 |
+
# fetchers.
|
| 9 |
+
|
| 10 |
+
# The implementation is **stateless** and thread-safe β no global mutable state is
|
| 11 |
+
# kept apart from the shared requests session from `config` (mirroring the rest of
|
| 12 |
+
# the code-base).
|
| 13 |
+
# """
|
| 14 |
+
|
| 15 |
+
# from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
# import logging
|
| 18 |
+
# import os
|
| 19 |
+
# import urllib.parse as _u
|
| 20 |
+
|
| 21 |
+
# from config import CFG, _SESS # shared requests session and config
|
| 22 |
+
# from web_helpers import retry
|
| 23 |
+
|
| 24 |
+
# _JINA_ENDPOINT = "https://r.jina.ai/{url}" # Note: will prepend http:// when formatting
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# @retry
|
| 28 |
+
# def fetch_jina(url: str) -> str:
|
| 29 |
+
# """Return article text extracted by **Jina AI Read API**.
|
| 30 |
+
|
| 31 |
+
# Raises:
|
| 32 |
+
# RuntimeError β if the endpoint does not yield usable text
|
| 33 |
+
# """
|
| 34 |
+
# api_url = _JINA_ENDPOINT.format(url=url)
|
| 35 |
+
# headers = {
|
| 36 |
+
# "Authorization": f"Bearer {CFG.jina_key}"
|
| 37 |
+
# }
|
| 38 |
+
# logging.debug("Jina fetch β %s", api_url)
|
| 39 |
+
|
| 40 |
+
# # Make request
|
| 41 |
+
# r = _SESS.get(api_url, headers=headers, timeout=(CFG.connect_to, CFG.read_to))
|
| 42 |
+
# r.raise_for_status()
|
| 43 |
+
|
| 44 |
+
# txt = r.text.strip()
|
| 45 |
+
|
| 46 |
+
# # Treat short or errorful body as failure
|
| 47 |
+
# if len(txt) < 200 and any(err in txt.lower() for err in ["403", "forbidden", "error"]):
|
| 48 |
+
# raise RuntimeError("Jina AI returned no content")
|
| 49 |
+
|
| 50 |
+
# return "[Retrieved from Jina AI] " + txt[: CFG.text_cap]
|
| 51 |
+
|
| 52 |
+
"""
|
| 53 |
+
Jina AI powered web-page fetcher with URL-based disk cache.
|
| 54 |
+
|
| 55 |
+
- Cache key: canonicalized URL (sha256)
|
| 56 |
+
- Cache location: <CFG.jina_cache_dir>/jina_read/
|
| 57 |
+
- Always stores the *raw* Jina body (without the "[Retrieved...]" prefix).
|
| 58 |
+
- Atomic writes via os.replace for basic thread/process safety.
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
from __future__ import annotations
|
| 62 |
+
|
| 63 |
+
import hashlib
|
| 64 |
+
import logging
|
| 65 |
+
import os
|
| 66 |
+
import urllib.parse as _u
|
| 67 |
+
from typing import Tuple
|
| 68 |
+
|
| 69 |
+
from config import CFG, _SESS # shared requests session and config
|
| 70 |
+
from web_helpers import retry
|
| 71 |
+
|
| 72 |
+
_JINA_ENDPOINT = "https://r.jina.ai/{url}" # expects a fully-qualified, url-encoded target
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _canonicalize_url(url: str) -> str:
|
| 76 |
+
"""Ensure URL has a scheme and is normalized for caching/API calls."""
|
| 77 |
+
p = _u.urlparse(url.strip())
|
| 78 |
+
if not p.scheme:
|
| 79 |
+
# Default to http if missing; Jina reader prefers explicit scheme.
|
| 80 |
+
p = _u.urlparse("http://" + url.strip())
|
| 81 |
+
|
| 82 |
+
# Normalize: lowercase scheme/netloc, drop fragment, keep query & path
|
| 83 |
+
p = p._replace(scheme=p.scheme.lower(), netloc=p.netloc.lower(), fragment="")
|
| 84 |
+
# Ensure path is at least "/"
|
| 85 |
+
path = p.path if p.path else "/"
|
| 86 |
+
return _u.urlunparse((p.scheme, p.netloc, path, "", p.query, ""))
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _cache_paths(nurl: str) -> Tuple[str, str]:
|
| 90 |
+
"""Return (cache_dir, cache_file_path) for a normalized URL."""
|
| 91 |
+
cache_root = CFG.jina_cache_dir
|
| 92 |
+
cache_dir = os.path.join(cache_root, "jina_read")
|
| 93 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 94 |
+
h = hashlib.sha256(nurl.encode("utf-8")).hexdigest()
|
| 95 |
+
return cache_dir, os.path.join(cache_dir, f"{h}.txt")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _load_from_cache(cpath: str) -> str | None:
|
| 99 |
+
try:
|
| 100 |
+
if os.path.exists(cpath) and os.path.getsize(cpath) > 0:
|
| 101 |
+
with open(cpath, "r", encoding="utf-8") as f:
|
| 102 |
+
return f.read()
|
| 103 |
+
except Exception as e:
|
| 104 |
+
logging.debug("Jina cache read failed (%s): %s", cpath, e)
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _save_to_cache(cpath: str, body: str) -> None:
|
| 109 |
+
try:
|
| 110 |
+
tmp = f"{cpath}.tmp.{os.getpid()}"
|
| 111 |
+
with open(tmp, "w", encoding="utf-8") as f:
|
| 112 |
+
f.write(body)
|
| 113 |
+
os.replace(tmp, cpath) # atomic on the same filesystem
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logging.debug("Jina cache write failed (%s): %s", cpath, e)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@retry
|
| 119 |
+
def fetch_jina(url: str) -> str:
|
| 120 |
+
"""Return article text extracted by **Jina AI Read API** with disk cache.
|
| 121 |
+
|
| 122 |
+
Raises:
|
| 123 |
+
RuntimeError β if the endpoint does not yield usable text
|
| 124 |
+
"""
|
| 125 |
+
nurl = _canonicalize_url(url)
|
| 126 |
+
cache_dir, cpath = _cache_paths(nurl)
|
| 127 |
+
|
| 128 |
+
# 1) Try cache
|
| 129 |
+
cached = _load_from_cache(cpath)
|
| 130 |
+
if cached:
|
| 131 |
+
logging.info("Jina fetch (cache hit) β %s", nurl)
|
| 132 |
+
return "[Retrieved from Jina AI] " + cached[: CFG.text_cap]
|
| 133 |
+
|
| 134 |
+
# 2) Fetch from Jina
|
| 135 |
+
api_url = _JINA_ENDPOINT.format(url=_u.quote(nurl, safe=""))
|
| 136 |
+
headers = {"Authorization": f"Bearer {CFG.jina_key}"}
|
| 137 |
+
logging.debug("Jina fetch (cache miss) β %s", api_url)
|
| 138 |
+
|
| 139 |
+
r = _SESS.get(api_url, headers=headers, timeout=(CFG.connect_to, CFG.read_to))
|
| 140 |
+
r.raise_for_status()
|
| 141 |
+
body = r.text.strip()
|
| 142 |
+
|
| 143 |
+
# 3) Validate
|
| 144 |
+
if len(body) < 200 and any(err in body.lower() for err in ("403", "forbidden", "error")):
|
| 145 |
+
raise RuntimeError("Jina AI returned no content")
|
| 146 |
+
|
| 147 |
+
# 4) Save to cache (store the raw body; callers always get the standard prefix)
|
| 148 |
+
_save_to_cache(cpath, body)
|
| 149 |
+
|
| 150 |
+
return "[Retrieved from Jina AI] " + body[: CFG.text_cap]
|
| 151 |
+
|
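For reference, the cache-key scheme can be exercised in isolation. A minimal standalone sketch (standard library only; the ".cache/jina_read" root here is an illustrative stand-in for CFG.jina_cache_dir):

import hashlib
import urllib.parse as _u

def cache_file_for(url: str, root: str = ".cache/jina_read") -> str:
    # Mirror _canonicalize_url: add a scheme, lowercase host, drop fragment.
    p = _u.urlparse(url.strip())
    if not p.scheme:
        p = _u.urlparse("http://" + url.strip())
    p = p._replace(scheme=p.scheme.lower(), netloc=p.netloc.lower(), fragment="")
    nurl = _u.urlunparse((p.scheme, p.netloc, p.path or "/", "", p.query, ""))
    # sha256 of the normalized URL names the cache file.
    return f"{root}/{hashlib.sha256(nurl.encode('utf-8')).hexdigest()}.txt"

print(cache_file_for("Example.COM/page?q=1#frag"))
print(cache_file_for("http://example.com/page?q=1"))  # same file: case/fragment ignored

Because the key is derived from the normalized URL, trivially different spellings of the same page share one cache entry.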
web_agents_5/fetchers/pdf_fetcher.py
ADDED
@@ -0,0 +1,53 @@
from __future__ import annotations
from config import CFG, _SESS
import io, logging, pymupdf as fitz

from web_helpers import retry, fetch_blocked_site  # shared helpers
# ----------------------------------------------------------------------

class PDFExtractError(RuntimeError): ...

@retry
def _download_pdf(url: str) -> bytes:
    with _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to)) as r:
        r.raise_for_status()
        total = int(r.headers.get("content-length", 0) or 0)
        if total and total > CFG.pdf_size_cap:
            raise RuntimeError("pdf too large")
        buf = io.BytesIO()
        for chunk in r.iter_content(16_384):
            buf.write(chunk)
            if buf.tell() > CFG.pdf_size_cap:
                raise RuntimeError("pdf exceeds cap")
        return buf.getvalue()

def _extract_pdf(buf: bytes) -> str:
    try:
        doc = fitz.open(stream=buf, filetype="pdf")
    except Exception as e:
        raise PDFExtractError(e)
    parts, chars = [], 0
    for page in doc:
        if len(parts) >= CFG.pdf_pages_cap:
            break
        text = (
            page.get_text("text")
            .replace("\u00A0", " ")  # non-breaking space -> plain space
            .replace("-\n", "")      # re-join words hyphenated across lines
        )
        parts.append(text)
        chars += len(text)
        if chars > CFG.pdf_chars_cap:
            break
    out = " ".join(parts).strip()[:CFG.pdf_chars_cap]
    if not out:
        raise PDFExtractError("empty / scanned pdf")
    return "[Retrieved from PyMuPDF] " + out

def fetch_pdf(url: str) -> str:
    try:
        buf = _download_pdf(url)
        return _extract_pdf(buf)
    except Exception as e:
        logging.error("PDF fetch failed for %s: %s", url, e)
        return fetch_blocked_site(url)[:CFG.text_cap]
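The download path enforces the size cap twice: up front via Content-Length when the server reports one, and again while streaming, so a server that lies or omits the header still cannot exceed the budget. A standalone sketch of that pattern (the 64-byte cap and fake chunk source are illustrative stand-ins for CFG.pdf_size_cap and r.iter_content):

import io

def bounded_read(chunks, size_cap: int, reported_len: int | None = None) -> bytes:
    """Accumulate chunks into memory, refusing to exceed size_cap."""
    if reported_len and reported_len > size_cap:
        raise RuntimeError("too large (declared)")   # cheap pre-check
    buf = io.BytesIO()
    for chunk in chunks:
        buf.write(chunk)
        if buf.tell() > size_cap:                    # hard check while streaming
            raise RuntimeError("too large (streamed)")
    return buf.getvalue()

print(len(bounded_read([b"x" * 16] * 3, size_cap=64)))  # 48 bytes, fits
try:
    bounded_read([b"x" * 16] * 5, size_cap=64)           # 80 bytes, rejected mid-stream
except RuntimeError as e:
    print(e)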
web_agents_5/fetchers/reddit_fetcher.py
ADDED
@@ -0,0 +1,324 @@
from __future__ import annotations

import logging
import re
import threading
import time
import urllib.parse as _u

import requests
from bs4 import BeautifulSoup

from config import CFG, _SESS
from web_helpers import retry, fetch_blocked_site


_REDDIT_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"


@retry
def _reddit_json_api(url: str) -> str | None:
    api_url = re.sub(r"/comments/([a-z0-9]{6,8}).*", r"/comments/\1.json", url)
    try:
        headers = {"User-Agent": _REDDIT_UA, "Accept": "application/json"}
        r = _SESS.get(
            api_url,
            params={"limit": 5, "depth": 2, "raw_json": 1},
            headers=headers,
            timeout=(CFG.connect_to, CFG.read_to),
        )
        if "blocked" in r.text.lower() or r.status_code != 200:
            return None

        data = r.json()
        post_data = data[0]["data"]["children"][0]["data"]
        title = post_data.get("title", "")
        selftext = post_data.get("selftext", "")
        author = post_data.get("author", "")

        comments = []
        if len(data) > 1:
            for comment in data[1]["data"]["children"][:50]:
                if comment["kind"] == "t1":
                    c_author = comment["data"].get("author", "")
                    c_body = comment["data"].get("body", "")
                    if c_body:
                        comments.append(f"u/{c_author}: {c_body}")

        result = f"Title: {title}\nPosted by: u/{author}\n\n"
        if selftext:
            result += f"{selftext}\n\n"
        if comments:
            result += "Top comments:\n\n" + "\n\n".join(comments)

        return result.strip()
    except Exception:
        return None


_ID_RE = re.compile(r"([a-z0-9]{6,8})", re.I)

def _extract_post_id(url: str) -> str | None:
    """
    Heuristics to find the 6-8-char base-36 Reddit ID in *any* post URL:
      - short-link         redd.it/<id>
      - /r/sub/abc123/...  (old style)
      - /comments/<id>/... (API-friendly)
    """
    # 1) short-link host
    u = _u.urlparse(url)
    if u.netloc in {"redd.it", "www.redd.it"}:
        return u.path.lstrip("/").split("/")[0] or None

    # 2) /comments/<id>/ pattern (works already)
    m = re.search(r"/comments/([a-z0-9]{6,8})", url, re.I)
    if m:
        return m.group(1)

    # 3) generic "/r/<sub>/<id>/" or trailing ".../<id>"
    parts = [p for p in u.path.split("/") if p]
    for p in parts[::-1]:  # search from right-most
        if _ID_RE.fullmatch(p):
            return p
    return None

# ----------------------------------------------------------------------
# Reddit OAuth helper - app-only token (read-only)
# ----------------------------------------------------------------------
_TOKEN_LOCK = threading.Lock()
_REDDIT_TOKEN_CACHE: dict[str, str | float | None] = {"token": None, "expires": 0.0}

def get_reddit_token(client_id: str, client_secret: str) -> str | None:
    """
    Return a cached bearer token obtained via Reddit's client-credentials flow.
    Returns None on error so callers can fall back to other scraping paths.
    """
    now = time.time()

    # Fast path - cached and still valid (30-sec grace)
    if (_tok := _REDDIT_TOKEN_CACHE["token"]) and now < _REDDIT_TOKEN_CACHE["expires"] - 30:
        return _tok

    with _TOKEN_LOCK:  # only one thread refreshes
        # Re-check after acquiring the lock
        if (_tok := _REDDIT_TOKEN_CACHE["token"]) and now < _REDDIT_TOKEN_CACHE["expires"] - 30:
            return _tok

        try:
            auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
            headers = {"User-Agent": _REDDIT_UA}
            data = {"grant_type": "client_credentials"}  # app-only, read-only
            r = requests.post(
                "https://www.reddit.com/api/v1/access_token",
                auth=auth,
                data=data,
                headers=headers,
                timeout=10,
            )
            r.raise_for_status()
            payload = r.json()
            token = payload["access_token"]
            ttl = int(payload.get("expires_in", 3600))
            _REDDIT_TOKEN_CACHE.update({"token": token, "expires": now + ttl})
            return token
        except Exception as e:
            logging.warning("Reddit token fetch failed: %s", e)
            return None


@retry
def reddit_official_api(url: str, client_id: str, client_secret: str) -> str | None:
    """
    - Works for *any* Reddit permalink or short-link.
    - If the URL is a subreddit root (/r/<sub>) it still fetches 3 hot posts + top comment (unchanged).
    """
    token = get_reddit_token(client_id, client_secret)
    if not token:
        return None

    headers = {
        "Authorization": f"bearer {token}",
        "User-Agent": _REDDIT_UA,
    }

    # --------------------------------------------------------------------
    # 1. Try to treat it as a *post* link by extracting an ID
    # --------------------------------------------------------------------
    post_id = _extract_post_id(url)
    if post_id:
        try:
            r = requests.get(
                f"https://oauth.reddit.com/comments/{post_id}",
                headers=headers,
                params={"limit": 5, "depth": 2, "raw_json": 1},
                timeout=10,
            )
            r.raise_for_status()
            data = r.json()

            post = data[0]["data"]["children"][0]["data"]
            title = post.get("title", "")
            body = post.get("selftext", "")
            author = post.get("author", "")

            comments = []
            if len(data) > 1:
                for c in data[1]["data"]["children"][:50]:
                    if c["kind"] == "t1":
                        c_auth = c["data"].get("author", "")
                        c_body = c["data"].get("body", "")
                        if c_body:
                            comments.append(f"u/{c_auth}: {c_body}")

            out = f"Title: {title}\nPosted by: u/{author}\n\n"
            if body:
                out += f"{body}\n\n"
            if comments:
                out += "Top comments:\n\n" + "\n\n".join(comments)
            return out.strip()

        except Exception as e:
            logging.debug("Official API post fetch failed (%s); will try other strategies", e)

    # --------------------------------------------------------------------
    # 2. If not a post (or the fetch above failed) treat as *subreddit*
    #    root and list 3 hot posts, each with top comment (unchanged).
    # --------------------------------------------------------------------
    m_sub = re.search(r"reddit\.com/r/([^/?#]+)", url)
    if not m_sub:
        return None  # allow caller to fall back

    subreddit = m_sub.group(1)
    try:
        r = requests.get(
            f"https://oauth.reddit.com/r/{subreddit}/hot",
            headers=headers,
            params={"limit": 3, "raw_json": 1},
            timeout=10,
        )
        r.raise_for_status()
        posts = r.json()["data"]["children"]

        out_blocks = []
        for p in posts:
            pd = p["data"]
            pid = pd["id"]
            title = pd.get("title", "")
            auth = pd.get("author", "")
            link = pd.get("permalink", "")

            # fetch one top comment
            top_comment = ""
            try:
                c = requests.get(
                    f"https://oauth.reddit.com/comments/{pid}",
                    headers=headers,
                    params={"limit": 1, "depth": 1, "raw_json": 1},
                    timeout=10,
                ).json()
                if len(c) > 1:
                    for cmt in c[1]["data"]["children"]:
                        if cmt["kind"] == "t1":
                            cauth = cmt["data"].get("author", "")
                            cbody = cmt["data"].get("body", "")
                            top_comment = f"u/{cauth}: {cbody}"
                            break
            except Exception:
                pass

            block = f"Title: {title}\nPosted by: u/{auth}\nLink: https://www.reddit.com{link}\n"
            if top_comment:
                block += f"Top comment:\n{top_comment}"
            out_blocks.append(block)

        return "\n\n---\n\n".join(out_blocks)

    except Exception as e:
        logging.debug("Official API subreddit fetch failed: %s", e)
        return None


@retry
def _reddit_old_version(url: str) -> str | None:
    old_url = url.replace("www.reddit.com", "old.reddit.com")
    try:
        r = _SESS.get(old_url, headers={"User-Agent": _REDDIT_UA}, timeout=(CFG.connect_to, CFG.read_to))
        if r.status_code != 200:
            return None

        soup = BeautifulSoup(r.text, "lxml")
        title = soup.select_one(".title").text.strip() if soup.select_one(".title") else ""
        author = soup.select_one(".author").text.strip() if soup.select_one(".author") else ""
        post_body = soup.select_one(".usertext-body")
        post_text = post_body.get_text(strip=True) if post_body else ""

        comments = []
        for comment in soup.select(".comment")[:50]:
            c_author = comment.select_one(".author")
            c_body = comment.select_one(".usertext-body")
            if c_author and c_body:
                comments.append(f"u/{c_author.text}: {c_body.get_text(strip=True)}")

        result = f"Title: {title}\nPosted by: u/{author}\n\n"
        if post_text:
            result += f"{post_text}\n\n"
        if comments:
            result += "Top comments:\n\n" + "\n\n".join(comments)

        return result.strip()
    except Exception:
        logging.debug("old.reddit scrape failed for %s", url)
        return None

@retry
def _pushshift_fallback(url: str) -> str | None:
    m = re.search(r"/comments/([a-z0-9]{6,8})", url)
    if not m:
        return None
    link_id = m.group(1)
    try:
        pst = _SESS.get(
            "https://api.pushshift.io/reddit/submission/search/",
            params={"ids": link_id, "size": 1},
            timeout=10,
        ).json()["data"]
        post_txt = pst[0]["selftext"] if pst else ""

        com = _SESS.get(
            "https://api.pushshift.io/reddit/comment/search/",
            params={"link_id": link_id, "sort": "desc", "size": 3},
            timeout=10,
        ).json()["data"]
        top_txt = "\n\n".join(c["body"] for c in com)

        txt = (post_txt + "\n\n" + top_txt).strip()
        return txt or None
    except Exception:
        return None

def fetch_reddit(url: str) -> str:
    # Strategy order: old-reddit scrape -> official OAuth API -> public JSON
    # endpoint -> Pushshift archive -> generic blocked-site fallback.
    txt = _reddit_old_version(url)
    if txt:
        return "[Retrieved from Reddit]" + txt[:CFG.text_cap]

    if CFG.reddit_client_id and CFG.reddit_client_secret:
        txt = reddit_official_api(url, CFG.reddit_client_id, CFG.reddit_client_secret)
        if txt:
            return "[Retrieved from Reddit]" + txt[:CFG.text_cap]

    txt = _reddit_json_api(url)
    if txt:
        return "[Retrieved from Reddit]" + txt[:CFG.text_cap]

    txt = _pushshift_fallback(url)
    if txt:
        return "[Retrieved from Reddit]" + txt[:CFG.text_cap]

    return fetch_blocked_site(url)[:CFG.text_cap]
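A quick standalone check of the ID-extraction heuristics, copying the same three rules (the sample URLs are made up):

import re
import urllib.parse as _u

_ID_RE = re.compile(r"([a-z0-9]{6,8})", re.I)

def extract_post_id(url: str) -> str | None:
    u = _u.urlparse(url)
    if u.netloc in {"redd.it", "www.redd.it"}:              # rule 1: short-link
        return u.path.lstrip("/").split("/")[0] or None
    m = re.search(r"/comments/([a-z0-9]{6,8})", url, re.I)  # rule 2: /comments/<id>
    if m:
        return m.group(1)
    parts = [p for p in u.path.split("/") if p]             # rule 3: right-most token
    for p in reversed(parts):
        if _ID_RE.fullmatch(p):
            return p
    return None

for sample in ("https://redd.it/abc123",
               "https://www.reddit.com/r/python/comments/abc123/some_title/",
               "https://www.reddit.com/r/python/abc123"):
    print(sample, "->", extract_post_id(sample))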
web_agents_5/fetchers/youtube_fetcher.py
ADDED
@@ -0,0 +1,50 @@
from __future__ import annotations
import logging, re
from config import CFG, _SESS
from web_helpers import retry, fetch_blocked_site

try:
    import yt_dlp
    _HAS = True
except ImportError:
    _HAS = False

_LANGS = ["en", "en-US"]

@retry
def fetch_youtube(url: str) -> str:
    if not _HAS:
        return fetch_blocked_site(url)[:CFG.text_cap]

    try:
        ydl_opts = {"quiet": True, "no_warnings": True,
                    "writesubtitles": True, "writeautomaticsub": True,
                    "skip_download": True}
        with yt_dlp.YoutubeDL(ydl_opts) as y:
            info = y.extract_info(url, download=False)

        # Prefer manual subtitles, then auto captions, in the listed languages.
        subs = info.get("subtitles", {}) or {}
        auto = info.get("automatic_captions", {}) or {}
        tracks = next((subs.get(l) or auto.get(l) for l in _LANGS
                       if subs.get(l) or auto.get(l)), None)
        if not tracks:
            tracks = next(iter(subs.values()), []) or next(iter(auto.values()), [])

        if tracks:
            cap_url = tracks[0]["url"]
            if "fmt=" not in cap_url:
                cap_url += "&fmt=json3"
            r = _SESS.get(cap_url, timeout=(CFG.connect_to, CFG.read_to))
            r.raise_for_status()
            if cap_url.endswith(".vtt"):
                # Drop WebVTT cue timings; keep only the caption text lines.
                text = " ".join(l for l in r.text.splitlines()
                                if l and "-->" not in l and not re.match(r"\d{2}:\d{2}", l))
            else:
                text = " ".join(seg["utf8"] for ev in r.json()["events"]
                                for seg in ev.get("segs", []))
            if text:
                return text[:CFG.text_cap]

        # No usable captions: fall back to title + description metadata.
        meta = (info.get("title", "") + "\n\n" + info.get("description", "")).strip()
        return "[Retrieved from yt-dlp] " + meta[:CFG.text_cap]
    except Exception as e:
        logging.error("YouTube fetch failed %s: %s", url, e)
        return fetch_blocked_site(url)[:CFG.text_cap]
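The json3 branch flattens caption "events" into plain text. A tiny standalone illustration with a made-up payload in the same shape (values invented):

# Minimal json3-style caption payload; only the shape matters here.
payload = {
    "events": [
        {"segs": [{"utf8": "Hello"}, {"utf8": " world."}]},
        {},  # events without "segs" (e.g. window definitions) are skipped
        {"segs": [{"utf8": "Second line."}]},
    ]
}

text = " ".join(seg["utf8"] for ev in payload["events"] for seg in ev.get("segs", []))
print(text)  # "Hello  world. Second line."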
web_agents_5/fetchers_async.py
ADDED
@@ -0,0 +1,155 @@
# """
# fetchers_async.py - Orchestrates multiple specialised fetchers **without changing
# its public surface** (`async def fetch_url(url: str) -> str`).
#
# Order of strategies (after specialised handlers):
#   1. **Jina AI**     - fast & cheap full-text extraction
#   2. **Crawl4AI**    - browser-based heavy-weight fallback
#   3. **Legacy HTML** - trafilatura / readability last-chance scrape
#
# Specialised fetchers (PDF, YouTube, Reddit) remain unchanged.
# """
# from __future__ import annotations
#
# import asyncio, logging
# from typing import Callable
#
# from web_helpers import retry
# from fetchers.pdf_fetcher import fetch_pdf
# from fetchers.youtube_fetcher import fetch_youtube
# from fetchers.reddit_fetcher import fetch_reddit
# from fetchers.github_fetcher import fetch_github
#
# from fetchers.jina_fetcher import fetch_jina
# from fetchers.crawl4ai_fetcher import fetch_crawl4ai
# from fetchers.basic_fetcher import fetch_html
#
#
# _ERR_PREFIXES = ("[error", "[failed", "[unable")
#
#
# def _looks_error(txt: str | None) -> bool:
#     return not txt or txt.strip().lower().startswith(_ERR_PREFIXES)
#
#
# async def _thread_wrapper(fn: Callable[[str], str], url: str) -> str | None:
#     try:
#         return await asyncio.to_thread(fn, url)
#     except Exception as exc:
#         logging.debug("%s threw in thread: %s", fn.__name__, exc)
#
# @retry
# async def fetch_url(url: str) -> str:
#     url_l = url.lower()
#
#     # 1 - Jina AI ------------------------------------------------------------
#     if (out := await _thread_wrapper(fetch_jina, url)) and not _looks_error(out):
#         return out
#
#     # if (out := await _thread_wrapper(fetch_html, url)) and not _looks_error(out):
#     #     return out
#
#     # 2 - Crawl4AI -----------------------------------------------------------
#     try:
#         md = await fetch_crawl4ai(url)
#         if not _looks_error(md):
#             return md
#     except Exception as e:
#         logging.debug("Crawl4AI failed: %s", e)
#
#     if "pdf" in url_l:
#         if (out := await _thread_wrapper(fetch_pdf, url)) and not _looks_error(out):
#             return out
#     if "reddit" in url_l:
#         if (out := await _thread_wrapper(fetch_reddit, url)) and not _looks_error(out):
#             return out
#     if "youtube" in url_l:
#         if (out := await _thread_wrapper(fetch_youtube, url)) and not _looks_error(out):
#             return out
#     if "github" in url_l:
#         if (out := await _thread_wrapper(fetch_github, url)) and not _looks_error(out):
#             return out
#
#     # 3 - Basic HTML --------------------------------------------------------
#     if (out := await _thread_wrapper(fetch_html, url)) and not _looks_error(out):
#         return out
#
#     return "[error fetch_url exhausted all methods]"


import asyncio, logging, time

from fetchers.pdf_fetcher import fetch_pdf
from fetchers.youtube_fetcher import fetch_youtube
from fetchers.reddit_fetcher import fetch_reddit
from fetchers.github_fetcher import fetch_github
from fetchers.jina_fetcher import fetch_jina
from fetchers.crawl4ai_fetcher import fetch_crawl4ai
from fetchers.basic_fetcher import fetch_html

_ERR_PREFIXES = ("[error", "[failed", "[unable")

def _looks_error(txt: str | None) -> bool:
    return not txt or txt.strip().lower().startswith(_ERR_PREFIXES)

# per-fetcher hard caps (seconds)
_FETCHER_TIMEOUTS = {
    "fetch_jina": 20.0,
    "fetch_github": 10.0,
    "fetch_crawl4ai": 40.0,
    "fetch_html": 20.0,
    "fetch_pdf": 30.0,
    "fetch_youtube": 30.0,
    "fetch_reddit": 10.0,
}

_EXHAUSTED = "[error fetch_url exhausted all methods]"


async def fetch_url(url: str) -> str:
    url_l = url.lower()

    async def timed_fetch(fn) -> str | None:
        name = fn.__name__
        timeout = _FETCHER_TIMEOUTS.get(name, 60.0)
        start_ts = time.perf_counter()
        try:
            # choose sync or async execution path
            coro = fn(url) if asyncio.iscoroutinefunction(fn) else asyncio.to_thread(fn, url)
            result = await asyncio.wait_for(coro, timeout=timeout)
            elapsed = (time.perf_counter() - start_ts) * 1000
            if result and not _looks_error(result):
                logging.info(f"[{name}] success in {elapsed:.1f} ms")
                return result
            logging.warning(f"[{name}] error response in {elapsed:.1f} ms")
        except asyncio.TimeoutError:
            logging.warning(f"[{name}] timed out after {timeout}s")
        except Exception as e:
            elapsed = (time.perf_counter() - start_ts) * 1000
            logging.warning(f"[{name}] exception in {elapsed:.1f} ms - {e}")
        return None

    async def try_chain(*fetchers) -> str | None:
        for fn in fetchers:
            if result := await timed_fetch(fn):
                return result
        return None

    # -------------- domain-specific chains ---------------
    if "github.com" in url_l:
        return await try_chain(fetch_jina, fetch_github, fetch_crawl4ai) or _EXHAUSTED
    if "wikipedia.org" in url_l:
        return await try_chain(fetch_html, fetch_jina, fetch_crawl4ai) or _EXHAUSTED
    if "reddit.com" in url_l:
        return await try_chain(fetch_jina, fetch_reddit, fetch_html) or _EXHAUSTED
    if "quora.com" in url_l:
        return await try_chain(fetch_crawl4ai, fetch_jina, fetch_html) or _EXHAUSTED
    if "youtube.com" in url_l or "youtu.be" in url_l:
        return await try_chain(fetch_jina, fetch_youtube) or _EXHAUSTED
    if url_l.endswith(".pdf") or "pdf" in url_l:
        return await try_chain(fetch_jina, fetch_pdf, fetch_html, fetch_crawl4ai) or _EXHAUSTED

    # -------------- generic fallback ---------------------
    return (await try_chain(fetch_jina, fetch_crawl4ai, fetch_html)
            or _EXHAUSTED)
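The timed_fetch/try_chain pattern is the core of this module: every fetcher gets a hard wall-clock budget and the first non-error result wins. A self-contained toy version with dummy fetchers (names, delays, and the 0.1 s budget are invented for the demo):

import asyncio

async def slow_fetcher(url: str) -> str:
    await asyncio.sleep(5)          # will blow its 0.1 s budget below
    return "slow result"

async def flaky_fetcher(url: str) -> str:
    return "[error upstream 503]"   # error-shaped result is rejected

async def good_fetcher(url: str) -> str:
    return "page text"

async def try_chain(url: str, fetchers, timeout: float = 0.1) -> str:
    for fn in fetchers:
        try:
            out = await asyncio.wait_for(fn(url), timeout=timeout)
        except asyncio.TimeoutError:
            continue                 # budget exceeded: move to the next fetcher
        if out and not out.lower().startswith("[error"):
            return out
    return "[error exhausted all methods]"

print(asyncio.run(try_chain("https://example.com",
                            [slow_fetcher, flaky_fetcher, good_fetcher])))
# -> "page text"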
web_agents_5/host_serper2.sh
ADDED
@@ -0,0 +1,6 @@
export PYTHONPATH=./
export MAX_OUTBOUND=256
export JINA_CACHE_DIR=./../.cache/jina_cache
export SERPER_CACHE_DIR=./../.cache/serper_cache

python ./web_agents_5/sandbox_serper.py --port $PORT_SERPER_HOST --workers 256
web_agents_5/sandbox_serper.py
ADDED
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""sandbox_serper.py - resilient Serper sandbox v2.1

Fixes
-----
* Moved `global _MAX_OUTBOUND, _SEM` declaration to the **top of `main()`**
  before any reference, eliminating the `SyntaxError: name used prior to
  global declaration`.
* No functional changes otherwise.
"""

from __future__ import annotations
import argparse, asyncio, logging, os, time, traceback
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
import uvicorn

# ------------------------- logging setup --------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("sandbox_serper")

app = FastAPI()

class Req(BaseModel):
    env: str
    call: str
    timeout: int = 60

# --------------------- global throughput gate ---------------------
_MAX_OUTBOUND = int(os.getenv("MAX_OUTBOUND", "10"))
_SEM = asyncio.Semaphore(_MAX_OUTBOUND)

# ------------------------- endpoint -------------------------------
@app.post("/execute")
async def execute(req: Req):
    async with _SEM:  # throttle concurrent outbound calls
        result = await run_in_threadpool(_safe_eval, req.env,
                                         req.call, req.timeout)

    return {
        "output": "",
        "result": result,
        "error": None if not str(result).startswith("[tool-error]") else result,
    }

# --------------------- sandbox evaluator --------------------------

def _safe_eval(env: str, call: str, timeout: int):
    start = time.time()
    loc: dict = {}
    try:
        exec(env, {}, loc)
        exec(f"response = {call}", {}, loc)
        # Note: this is a post-hoc wall-clock check, not a pre-emptive timeout.
        if time.time() - start > timeout:
            raise TimeoutError(f"wall-clock timeout for call {call}")
        return loc.get("response", "[tool-error] no response var")
    except Exception as e:
        log.error("tool error: %s\n%s", e, traceback.format_exc())
        return f"[tool-error] {e}"

# --------------------------- main ---------------------------------

def main():
    global _MAX_OUTBOUND, _SEM  # <- moved to top

    ap = argparse.ArgumentParser()
    ap.add_argument("--port", type=int, default=1211)
    ap.add_argument("--workers", type=int, default=1)
    ap.add_argument("--reload", action="store_true")
    # ap.add_argument("--max_outbound", type=int, default=_MAX_OUTBOUND,
    #                 help="simultaneous outbound calls across all workers")
    args = ap.parse_args()

    _SEM = asyncio.Semaphore(_MAX_OUTBOUND)

    if args.reload and args.workers > 1:
        raise SystemExit("--reload and --workers>1 are mutually exclusive")

    # log.info("Starting sandbox :%d | workers=%d | max_outbound=%d",
    #          args.port, args.workers, _MAX_OUTBOUND)

    if args.workers > 1:
        uvicorn.run("sandbox_serper:app", host="0.0.0.0", port=args.port, workers=args.workers)
    else:
        uvicorn.run(app, host="0.0.0.0", port=args.port, reload=args.reload)


if __name__ == "__main__":
    main()
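A minimal client sketch for the /execute endpoint (assumes the sandbox is already running locally on port 1211, the default above; the env/call payload shape follows the Req model):

import requests

# env: code exec'd first to define names; call: expression whose value is returned.
payload = {
    "env": "def add(a, b):\n    return a + b",
    "call": "add(2, 3)",
    "timeout": 10,
}

resp = requests.post("http://localhost:1211/execute", json=payload, timeout=30)
print(resp.json())  # expected shape: {"output": "", "result": 5, "error": None}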
web_agents_5/search_api.py
ADDED
@@ -0,0 +1,63 @@
from __future__ import annotations
import asyncio
from utils import (google_search, url_hits_to_markdown,
                   search_result_to_markdown,
                   async_search_and_extract, _bad)
from fetchers_async import fetch_url
from compressor import compress_text, query_text
from config import CFG


def web_search(query):
    return search_urls(query=query, top_k=10)

def web_visit(url):
    return open_url(url=url, compress=False)

# ── 1. search_urls ──────────────────────────────────────────────────────
def search_urls(query: str, top_k: int = 10) -> str:
    return url_hits_to_markdown(google_search(query, top_k))

# ── 2. open_url ─────────────────────────────────────────────────────────
def open_url(url: str, *, compress: bool = True, pct: float = CFG.pct,
             model: str = "gpt-4o-mini") -> str:
    if err := _bad(url):
        return err
    try:
        body = str(asyncio.run(fetch_url(url)))
    except Exception as e:
        return f"[error fetching URL: {e}]"
    if compress:
        try:
            body = compress_text(body, pct=pct, model=model)
        except Exception as e:
            body = f"[compression failed: {e}]\n\n{body[:2000]}"
    return body

# ── 3. search_and_parse_query ───────────────────────────────────────────
def search_and_parse_query(query: str, top_k: int = 3, *,
                           compress: bool = True, pct: float = CFG.pct) -> str:
    blocks = asyncio.run(async_search_and_extract(query, top_k))
    if compress:
        for b in blocks:
            try:
                cmp = compress_text(b["body"], pct=pct)
                b["body"] = (f"**Summary:**\n{cmp['narrative']}\n\n"
                             f"**Facts:**\n{cmp['facts']}\n\n"
                             f"**Tables:**\n{cmp['tables']}")
            except Exception as e:
                b["body"] = f"[compression failed: {e}]\n\n{b['body']}"
    return search_result_to_markdown(blocks)

# ── 4. query_url ────────────────────────────────────────────────────────
def query_url(url: str, goal: str, *, model: str = "gpt-4.1-mini") -> str:
    if err := _bad(url):
        return err
    body = asyncio.run(fetch_url(url))
    if not body or body.startswith("[error"):
        return f"[failed to retrieve content from {url}]\n\n{body}"
    return query_text(url, body, goal, model=model)['extracted_info']
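Usage of these entry points is straightforward; a hedged sketch (requires SERPER_API_KEY, the rest of the config, and network access, so treat it as illustrative only):

from search_api import web_search, web_visit

# 1) Search: returns a markdown list of hits (title / URL / snippet).
print(web_search("site:arxiv.org retrieval augmented generation"))

# 2) Visit: returns the raw extracted page text, uncompressed.
print(web_visit("https://en.wikipedia.org/wiki/Information_retrieval")[:500])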
web_agents_5/utils.py
ADDED
@@ -0,0 +1,145 @@
from __future__ import annotations
import asyncio, hashlib, json, logging, os, time, tiktoken
from typing import Dict, List
from urllib.parse import urlparse

from config import CFG, _SESS
from fetchers_async import fetch_url
from web_helpers import retry

enc = tiktoken.get_encoding("cl100k_base")

# ── Google / Serper search ──────────────────────────────────────────────

# def google_search(query: str, top_k: int = 10) -> List[Dict[str,str]]:
#     if not CFG.serper_key:
#         raise EnvironmentError("SERPER_API_KEY not set")
#     r = _SESS.post(
#         CFG.serper_ep,
#         headers={"X-API-KEY": CFG.serper_key, "Content-Type": "application/json"},
#         json={"q": query}, timeout=20)
#     r.raise_for_status()
#     hits = []
#     for it in r.json().get("organic", []):
#         hits.append({"title": it.get("title",""),
#                      "link": it.get("link",""),
#                      "snippet": it.get("snippet","")})
#         if len(hits) == top_k: break
#     return hits

def _canon_query(q: str) -> str:
    # Normalize whitespace to avoid duplicate keys for e.g. "foo  bar"
    return " ".join((q or "").strip().split())


def _search_cache_key(query: str, top_k: int) -> str:
    cq = _canon_query(query)
    raw = f"{top_k}|{cq}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest() + ".json"

def _search_cache_paths(query: str, top_k: int) -> str:
    root = CFG.serper_cache_dir
    os.makedirs(root, exist_ok=True)
    return os.path.join(root, _search_cache_key(query, top_k))

def _ttl_seconds() -> int:
    # 0 or missing -> no expiry
    try:
        return int(getattr(CFG, "search_cache_ttl", 0) or int(os.environ.get("SEARCH_CACHE_TTL", "0")))
    except Exception:
        return 0

def _load_search_cache(path: str) -> List[Dict[str, str]] | None:
    try:
        if not os.path.exists(path) or os.path.getsize(path) <= 2:
            return None
        ttl = _ttl_seconds()
        if ttl > 0:
            age = time.time() - os.path.getmtime(path)
            if age > ttl:
                return None
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Basic shape check: list of dicts with expected keys
        if isinstance(data, list):
            return data
    except Exception as e:
        logging.debug("Serper cache read failed (%s): %s", path, e)
    return None

def _save_search_cache(path: str, hits: List[Dict[str, str]]) -> None:
    try:
        tmp = f"{path}.tmp.{os.getpid()}"
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(hits, f, ensure_ascii=False)
        os.replace(tmp, path)  # atomic on same FS
    except Exception as e:
        logging.debug("Serper cache write failed (%s): %s", path, e)


@retry
def google_search(query: str, top_k: int = 10) -> List[Dict[str, str]]:
    if not CFG.serper_key:
        raise EnvironmentError("SERPER_API_KEY not set")

    cpath = _search_cache_paths(query, top_k)
    cached = _load_search_cache(cpath)
    if cached is not None:
        logging.info("Serper search (cache hit) -> %r (top_k=%d)", _canon_query(query), top_k)
        return cached

    r = _SESS.post(
        CFG.serper_ep,
        headers={"X-API-KEY": CFG.serper_key, "Content-Type": "application/json"},
        json={"q": query},
        timeout=20
    )
    r.raise_for_status()
    hits: List[Dict[str, str]] = []
    for it in r.json().get("organic", []):
        hits.append({
            "title": it.get("title", ""),
            "link": it.get("link", ""),
            "snippet": it.get("snippet", ""),
        })
        if len(hits) == top_k:
            break

    _save_search_cache(cpath, hits)
    return hits


# ── async extract per hit ───────────────────────────────────────────────
async def async_search_and_extract(query: str, top_k: int = 5) -> List[Dict]:
    hits = google_search(query, top_k)
    async def enrich(h):
        return {**h, "body": await fetch_url(h["link"])}
    return await asyncio.gather(*(enrich(h) for h in hits))

# ── markdown helpers ────────────────────────────────────────────────────
def url_hits_to_markdown(hits: List[Dict[str, str]]) -> str:
    out = []
    for i, h in enumerate(hits, 1):
        out.append(f"### {i}. {h['title']}\n**URL**: {h['link']}\n\n**Snippet**: {h['snippet']}\n")
    return "\n---\n\n".join(out)

def search_result_to_markdown(blocks: List[Dict]) -> str:
    out = []
    for i, b in enumerate(blocks, 1):
        out.append(f"### {i}. **Title**: {b['title']}\n**URL**: {b['link']}\n\n"
                   f"**Snippet**: {b['snippet']}\n\n**Content**:\n{b['body']}\n")
    return "\n---\n\n".join(out)

def trim_to_tokens(text: str, limit: int, model: str = "gpt-3.5-turbo") -> str:
    # Keep the head and tail halves when the text exceeds the token budget.
    ids = enc.encode(text)
    if len(ids) <= limit:
        return text
    keep = limit // 2
    return enc.decode(ids[:keep] + ids[-keep:])

def _bad(url: str) -> str | None:
    p = urlparse(url)
    if p.scheme not in ("http", "https") or not p.netloc:
        return "[error: invalid URL - must start with http:// or https://]"
    return None
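trim_to_tokens keeps the head and tail of an over-long text and drops the middle, which tends to preserve intros and conclusions. A quick standalone check (requires tiktoken; the 20-token limit is arbitrary):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def trim_to_tokens(text: str, limit: int) -> str:
    ids = enc.encode(text)
    if len(ids) <= limit:
        return text
    keep = limit // 2
    return enc.decode(ids[:keep] + ids[-keep:])  # head + tail, middle dropped

text = " ".join(f"w{i}" for i in range(200))
short = trim_to_tokens(text, limit=20)
print(len(enc.encode(short)))  # <= 20
print(short[:30], "...", short[-30:])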
web_agents_5/web_helpers.py
ADDED
@@ -0,0 +1,64 @@
from __future__ import annotations
import functools, logging, re, time, requests, trafilatura
from typing import Callable
from bs4 import BeautifulSoup
from config import CFG, _RND

# ── retry ────────────────────────────────────────────────────────────────
def retry(fn: Callable) -> Callable:
    @functools.wraps(fn)
    def _wrap(*a, **kw):
        for i in range(CFG.retries):
            try:
                return fn(*a, **kw)
            except Exception as e:
                if i == CFG.retries - 1:
                    raise
                # exponential backoff with up to 30% jitter
                delay = CFG.backoff * (2 ** i) * (1 + 0.3 * _RND.random())
                logging.warning("Retry %s/%s %s: %s (%.2fs)",
                                i + 1, CFG.retries, fn.__name__, e, delay)
                time.sleep(delay)
    return _wrap

# ── text extraction ──────────────────────────────────────────────────────
def extract_main_text(html: str) -> str:
    txt = trafilatura.extract(html, output_format="txt") or ""
    if len(txt) >= 500:
        return txt
    from readability import Document  # lazy import: only needed as fallback
    soup = BeautifulSoup(Document(html).summary(), "lxml")
    txt = soup.get_text(" ", strip=True)
    if len(txt) >= 400:
        return txt
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return re.sub(r"\s+", " ", soup.get_text(" ").strip())

# ── last-chance fetch when everything fails ──────────────────────────────
@retry
def fetch_blocked_site(url: str) -> str:
    hdrs = {"User-Agent": CFG.ua, "Referer": "https://www.google.com/"}
    sess = requests.Session()
    sess.headers.update(hdrs)

    # 1) direct
    try:
        r = sess.get(url, timeout=(CFG.connect_to, CFG.read_to))
        r.raise_for_status()
        txt = extract_main_text(r.text)
        if len(txt) > 500:
            return "[Retrieved from redirected attempt]\n\n" + txt[:CFG.text_cap]
    except Exception as e:
        logging.debug("Direct scrape failed %s: %s", url, e)

    # 2) wayback
    try:
        wb = f"https://web.archive.org/web/2023/{url}"
        r = sess.get(wb, timeout=(CFG.connect_to, CFG.read_to))
        r.raise_for_status()
        txt = extract_main_text(r.text)
        if len(txt) > 500:
            return "[Retrieved from archive.org]\n\n" + txt[:CFG.text_cap]
    except Exception as e:
        logging.debug("Wayback scrape failed %s: %s", url, e)

    return f"[Error accessing {url}. Try VPN or manual archive.org check.]"
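The retry decorator is generic: exponential backoff with jitter, re-raising only on the final attempt. A self-contained toy version (fixed 3 retries and a 0.01 s base delay instead of the CFG values) showing the behavior on a flaky function:

import functools, logging, random, time

def retry(retries: int = 3, backoff: float = 0.01):
    def deco(fn):
        @functools.wraps(fn)
        def _wrap(*a, **kw):
            for i in range(retries):
                try:
                    return fn(*a, **kw)
                except Exception as e:
                    if i == retries - 1:
                        raise  # last attempt: propagate the error
                    delay = backoff * (2 ** i) * (1 + 0.3 * random.random())
                    logging.warning("retry %d/%d after %s (%.3fs)", i + 1, retries, e, delay)
                    time.sleep(delay)
        return _wrap
    return deco

calls = {"n": 0}

@retry()
def flaky() -> str:
    calls["n"] += 1
    if calls["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

print(flaky(), "after", calls["n"], "calls")  # ok after 3 calls

Jitter spreads retries out so that many workers hammered by the same transient failure do not all retry in lock-step.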