Tasmay-Tib commited on
Commit
5ab87e0
Β·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ .gradio/
3
+ temp/*
4
+ .cache/*
5
+ nohup.out
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Fathom DeepResearch
3
+ emoji: πŸ“Š
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.44.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Use the fathom search 4b model interactively
12
+ ---
app.py ADDED
The diff for this file is too large to render. See raw diff
 
eval_benchmarks.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # eval_benchmark_multithreaded.py
2
+ """Unified benchmarking script for ReCall, ZeroSearch, and R1‑Searcher
3
+ with optional multi‑threaded execution.
4
+
5
+ Example usage (single‑threaded)
6
+ -------------------------------
7
+ ```bash
8
+ python eval_benchmark.py \
9
+ --dataset frames \
10
+ --agent r1-searcher \
11
+ --model-url http://0.0.0.0:1233 \
12
+ --out-base /tmp/evals \
13
+ --mode single
14
+ ```
15
+
16
+ Example usage (multi‑threaded, 128 workers)
17
+ ------------------------------------------
18
+ ```bash
19
+ python eval_benchmark.py \
20
+ --dataset frames \
21
+ --agent recall \
22
+ --model-url http://0.0.0.0:1231 \
23
+ --out-base /tmp/evals \
24
+ --mode multi \
25
+ --workers 128
26
+ ```
27
+ The script will:
28
+ 1. Load the specified dataset JSONL file that contains objects with keys
29
+ `question` and `answer`.
30
+ 2. Build the chosen agent wrapper (`recall`, `zerosearch`, or `r1-searcher`).
31
+ 3. Stream one JSONL line per example with *all* details needed for analysis.
32
+ 4. Optionally run the evaluation loop in parallel using a configurable number
33
+ of worker threads.
34
+ 5. Automatically construct the output path as:
35
+ ```
36
+ {out_base}/{model_name}/{dataset}.jsonl
37
+ ```
38
+ where `model_name` is derived from the `--model-url` (characters after the
39
+ last `/`).
40
+ """
41
+ from __future__ import annotations
42
+
43
+ import argparse
44
+ import json
45
+ import logging
46
+ import os
47
+ import pathlib
48
+ import re
49
+ import threading
50
+ import time
51
+ from concurrent.futures import ThreadPoolExecutor, as_completed
52
+ from typing import Dict, List
53
+
54
+ import unicodedata
55
+ from openai import OpenAI, APIStatusError
56
+ from tqdm import tqdm
57
+
58
+ # --------------------------------------------------------------------
59
+ # Agent imports (ensure PYTHONPATH is set appropriately)
60
+ # --------------------------------------------------------------------
61
+ from re_call import ReCall # user's wrapper
62
+ # from re_call import ZeroSearchInference, ZeroSearchConfig
63
+ # from re_call import R1Searcher, R1SearchConfig as R1Cfg
64
+ # from re_call import O1Cfg, O1Searcher
65
+ from pathlib import Path
66
+ # from re_call import SDSCfg, SDSSearcher
67
+
68
+ # --------------------------------------------------------------------
69
+ # Environment Keys – override with real keys or environment variables
70
+ # --------------------------------------------------------------------
71
+ #for recall
72
+ # search_env = "from search_api import web_search, web_visit"
73
+ # search_schemas =[
74
+ # {
75
+ # "name": "web_search",
76
+ # "description": "Google search and return links to web-pages with a brief snippet given a text query",
77
+ # "parameters": {
78
+ # "type": "object",
79
+ # "properties": {
80
+ # "query": {"type": "string"},
81
+ # },
82
+ # "required": ["query"],
83
+ # },
84
+ # },
85
+ # {
86
+ # "name": "web_visit",
87
+ # "description": "Visit webpage and return its content",
88
+ # "parameters": {
89
+ # "type": "object",
90
+ # "properties": {
91
+ # "url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
92
+ # },
93
+ # "required": ["url"],
94
+ # },
95
+ # }
96
+ # ]
97
+ # for recall
98
+ search_env = "from search_api import search_urls, open_url, search_and_parse_query, query_url"
99
+ search_schemas =[
100
+ {
101
+ "name": "search_urls",
102
+ "description": "Google search and return links to web-pages with a brief snippet given a text query",
103
+ "parameters": {
104
+ "type": "object",
105
+ "properties": {
106
+ "query": {"type": "string"},
107
+ "top_k": {"type": "integer", "default": 10},
108
+ },
109
+ "required": ["query"],
110
+ },
111
+ },
112
+ {
113
+ "name": "query_url",
114
+ "description": "Visit webpage and return evidence based retrival for the provided goal",
115
+ "parameters": {
116
+ "type": "object",
117
+ "properties": {
118
+ "url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
119
+ "goal": {"type": "string", "description": "The specific information goal for visiting webpage"},
120
+ },
121
+ "required": ["url", "goal"],
122
+ },
123
+ }
124
+ ]
125
+
126
+ EXECUTOR_URL = os.environ["HOST_SERPER_URL"]
127
+ DATA_ROOT = pathlib.Path("./eval_datasets")
128
+ SEM = threading.Semaphore(3) # limit concurrent judge calls
129
+ JUDGE_MODEL = "gpt-4.1-mini"
130
+
131
+ try:
132
+ base = Path(__file__).resolve().parent
133
+ except NameError: # e.g., REPL/Jupyter
134
+ base = Path.cwd()
135
+
136
+ TOKENIZER_DIR = (base / "tokenizer-info").resolve()
137
+
138
+ # ───────────────────────── tokenizer ────────────────────────────────────────
139
+ try:
140
+ from transformers import AutoTokenizer
141
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
142
+ except Exception as e:
143
+ import sys
144
+ sys.exit(f"❌ Could not load Qwen3 tokenizer: {e}")
145
+
146
+ import hashlib
147
+
148
+ def get_uid(sample: dict) -> str:
149
+ """Generate a UID using SHA256 hash of question."""
150
+ return hashlib.sha256(sample["question"].strip().encode("utf-8")).hexdigest()
151
+
152
+ # --------------------------------------------------------------------
153
+ # Regex & utilities
154
+ # --------------------------------------------------------------------
155
+ def extract_answer_tagged(text: str) -> str:
156
+
157
+ ANS_RE = re.compile(r"<answer>(.*?)</answer>", re.S)
158
+ match = ANS_RE.findall(text)
159
+ if match :
160
+ return match[-1].strip().lower()
161
+ else:
162
+ print("No answer tags found")
163
+ return text[-200:] #because o1-searcher fails to follow format
164
+
165
+ def extract_answer_boxed(response):
166
+ def remove_boxed(s):
167
+ if "\\boxed " in s:
168
+ left = "\\boxed "
169
+ assert s[:len(left)] == left
170
+ return s[len(left):]
171
+
172
+ left = "\\boxed{"
173
+
174
+ assert s[:len(left)] == left
175
+ assert s[-1] == "}"
176
+
177
+ return s[len(left):-1]
178
+
179
+ def last_boxed_only_string(string):
180
+ idx = string.rfind("\\boxed")
181
+ if "\\boxed " in string:
182
+ return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
183
+ if idx < 0:
184
+ idx = string.rfind("\\fbox")
185
+ if idx < 0:
186
+ return None
187
+
188
+ i = idx
189
+ right_brace_idx = None
190
+ num_left_braces_open = 0
191
+ while i < len(string):
192
+ if string[i] == "{":
193
+ num_left_braces_open += 1
194
+ if string[i] == "}":
195
+ num_left_braces_open -= 1
196
+ if num_left_braces_open == 0:
197
+ right_brace_idx = i
198
+ break
199
+ i += 1
200
+
201
+ if right_brace_idx is None:
202
+ retval = None
203
+ else:
204
+ retval = string[idx:right_brace_idx + 1]
205
+
206
+ return retval
207
+ answer = remove_boxed(last_boxed_only_string(response))
208
+ return answer
209
+
210
+
211
+
212
+ JUDGE_SYS = """
213
+ You are an impartial judge evaluating the correctness of a model's answer against a ground-truth answer for a given question. Your task is to:
214
+ 1. Compare the model's answer to the ground-truth answer.
215
+ 2. Determine if the model's answer is correct or incorrect.
216
+
217
+ **Input Format:**
218
+ - Question: {question}
219
+ - Ground Truth: {ground_truth}
220
+ - Model Answer: {model_answer}
221
+
222
+ **Output Format:**
223
+ correct/incorrect/unknown
224
+
225
+ **Guidelines:**
226
+ - The model's answer is correct if it matches the ground-truth answer in meaning and content it is case-insensitive, ignore minor punctuation or formatting differences.
227
+ - If the model's answer contains additional information, it is still correct as long as the core answer matches the ground truth.
228
+ - Be precise output a single word correct / incorrect / unknown and **nothing else**
229
+ - For MCQ questions match the option ID A. B. C. or D. if its correct the answer is correct.
230
+ """
231
+ # - If the model's answer is partially correct or contains errors, it is incorrect.
232
+
233
+
234
+ # Thread‑local OpenAI client cache
235
+
236
+ def _oa() -> OpenAI:
237
+ th = threading.current_thread()
238
+ if not hasattr(th, "_oa"):
239
+ th._oa = OpenAI()
240
+ return th._oa
241
+
242
+
243
+ def judge(q: str, gt: str, pred: str) -> str:
244
+ if pred == "":
245
+ return "unknown"
246
+ prompt = JUDGE_SYS.format(question=q, ground_truth=gt, model_answer=pred)
247
+ try:
248
+ with SEM:
249
+ resp = _oa().chat.completions.create(
250
+ model=JUDGE_MODEL,
251
+ messages=[
252
+ {"role": "system", "content": JUDGE_SYS},
253
+ {"role": "user", "content": prompt},
254
+ ],
255
+ temperature=0.0,
256
+ max_tokens=100,
257
+ )
258
+ return resp.choices[0].message.content.strip().lower()
259
+ except APIStatusError:
260
+ return "unknown"
261
+
262
+
263
+ # --------------------------------------------------------------------
264
+ # Agent factory
265
+ # --------------------------------------------------------------------
266
+ def build_agent(kind: str, model_url: str):
267
+ kind = kind.lower()
268
+ print(kind)
269
+ if kind == "recall":
270
+ return ReCall(executor_url=EXECUTOR_URL)
271
+ else:
272
+ raise ValueError(f"Unknown agent kind: {kind}")
273
+ # if kind == "o1-search" or kind == "sds":
274
+ # cfg = O1Cfg()
275
+ # return O1Searcher(cfg, thinker_url=model_url)
276
+ # if kind == "zerosearch":
277
+ # cfg = ZeroSearchConfig(thinker_url=model_url)
278
+ # return ZeroSearchInference(cfg)
279
+ # if kind in ("r1-search", "r1-searcher", "r1"):
280
+ # cfg = R1Cfg(serper_api_key=os.getenv("SERPER_API_KEY", ""))
281
+ # return R1Searcher(cfg=cfg, model_url=model_url)
282
+ # raise ValueError(f"Unknown agent kind: {kind}")
283
+
284
+
285
+ # --------------------------------------------------------------------
286
+ # Core evaluation routine for a single example (thread‑safe)
287
+ # --------------------------------------------------------------------
288
+ def evaluate_example(example: Dict[str, str], agent_kind: str, model_url: str) -> Dict[str, str]:
289
+ """Run one example through the pipeline and return result row."""
290
+ question = example["question"].strip()
291
+ answer_gt = example["answer"].strip()
292
+ idx = example["id"].strip()
293
+
294
+
295
+ # Build a *fresh* agent per thread to avoid shared‑state issues
296
+ agent = build_agent(agent_kind, model_url=model_url)
297
+
298
+ if agent_kind == "recall" and model_url == "deepseek-ai/DeepSeek-R1":
299
+ # print(agent_kind)
300
+ # print("B"*100)
301
+ transcript, tool_calls = agent.run_deepseek(
302
+ env=search_env,
303
+ func_schemas=search_schemas,
304
+ question=question,
305
+ model_name="deepseek-ai/DeepSeek-R1",
306
+ temperature=0.6,
307
+ max_tokens=40960,
308
+ # tokenizer = tokenizer
309
+ )
310
+ elif agent_kind == "recall":
311
+ transcript, tool_calls, chat = agent.run(
312
+ env=search_env,
313
+ func_schemas=search_schemas,
314
+ question=question,
315
+ model_url=model_url,
316
+ temperature=0.6,
317
+ max_new_tokens=40960,
318
+ tokenizer = tokenizer
319
+ )
320
+ # tool_calls = agent.extract_tool_calls(transcript)
321
+ else: # zerosearch or r1‑searcher
322
+ transcript, tool_calls = agent.run(question)
323
+
324
+ if agent_kind in [
325
+ "r1-searcher",
326
+ "zerosearch",
327
+ # "o1-search",
328
+ ]:
329
+ pred = extract_answer_tagged(transcript)
330
+ if agent_kind in [
331
+ "recall",
332
+ "SDS"
333
+ "o1-searcher"
334
+ ]:
335
+ try:
336
+ pred = extract_answer_boxed(transcript)
337
+ except:
338
+ print("falling to last string")
339
+ pred = transcript[-200:]
340
+ else:
341
+ try:
342
+ pred = extract_answer_boxed(transcript)
343
+ except:
344
+ print("falling to last string")
345
+ pred = transcript[-200:]
346
+
347
+ verdict = judge(question, answer_gt.lower(), pred.lower())
348
+
349
+ return {
350
+ "id": idx,
351
+ "question": question,
352
+ "answer_gt": answer_gt,
353
+ "model_answer": pred,
354
+ "judge": verdict,
355
+ "tool_calls": tool_calls,
356
+ "transcript": transcript,
357
+ "chat": chat
358
+ }
359
+
360
+ # --------------------------------------------------------------------
361
+ # CLI entry‑point
362
+ # --------------------------------------------------------------------
363
+ def build_output_path(out_base, agent, dataset, name) -> pathlib.Path:
364
+ """Construct output path as {out_base}/{model_name}/{dataset}.jsonl."""
365
+ return out_base / f"{agent}" / f"{dataset}-{name}.jsonl"
366
+
367
+ def normalize(s: str) -> str:
368
+ return unicodedata.normalize("NFKD", s.strip().lower())
369
+
370
+ def load_existing_results(path: pathlib.Path) -> tuple[list[dict], set[str]]:
371
+ results = []
372
+ uids = set()
373
+ if not path.exists():
374
+ return results, uids
375
+ with open(path, "r", encoding="utf-8") as f:
376
+ for line in f:
377
+ try:
378
+ row = json.loads(line)
379
+ if row['model_answer'] != "":
380
+ results.append(row)
381
+ uids.add(row["id"])
382
+ except Exception:
383
+ continue
384
+ return results, uids
385
+
386
+ def main():
387
+ parser = argparse.ArgumentParser(description="Benchmark QA agents on a dataset (single or multi‑threaded)")
388
+ parser.add_argument("--dataset", required=True, help="dataset name (frames, …)")
389
+ parser.add_argument("--agent", required=True, choices=["recall", "zerosearch", "r1-searcher", "o1-search", "SDS", "deepseek-r1"], help="agent wrapper")
390
+ parser.add_argument("--out", required=True, help="base directory for outputs")
391
+ parser.add_argument("--model-url", required=False, help="URL of the model server")
392
+ parser.add_argument("--limit", type=int, default=0, help="optional cap on number of questions")
393
+ parser.add_argument("--mode", choices=["single", "multi"], default="single", help="execution mode")
394
+ parser.add_argument("--workers", type=int, default=8, help="number of worker threads for multi‑mode")
395
+ parser.add_argument("--name", type=str, default="", help="suffix for save dir")
396
+
397
+ args = parser.parse_args()
398
+
399
+ # ----------------------------------------------------------------
400
+ # Dataset loading
401
+ # ----------------------------------------------------------------
402
+ ds_path = DATA_ROOT / f"{args.dataset}.jsonl"
403
+ if not ds_path.exists():
404
+ raise FileNotFoundError(ds_path)
405
+
406
+ with ds_path.open() as f:
407
+ data = [json.loads(line) for line in f]
408
+
409
+ # ----------------------------------------------------------------
410
+ # Output path setup
411
+ # ----------------------------------------------------------------
412
+ out_base = pathlib.Path(args.out).expanduser().resolve()
413
+ out_path = build_output_path(out_base, args.agent, args.dataset, args.name)
414
+ print(out_path)
415
+ out_path.parent.mkdir(parents=True, exist_ok=True)
416
+
417
+ if args.limit:
418
+ data = data[: args.limit]
419
+ # data = data[246:]
420
+
421
+ correct = 0
422
+ start_time = time.perf_counter()
423
+
424
+
425
+ # ----------------------------------------------------------------
426
+ # SINGLE‑THREADED EXECUTION
427
+ # ----------------------------------------------------------------
428
+ if args.mode == "single":
429
+ with open(out_path, "w", encoding="utf-8") as fout:
430
+ for ex in tqdm(data, desc="QA loop (single)"):
431
+
432
+ row = evaluate_example(ex, args.agent, args.model_url)
433
+ if row["judge"] == "correct":
434
+ correct += 1
435
+ # context for row
436
+ row.update({"agent": args.agent, "dataset": args.dataset})
437
+ fout.write(json.dumps(row, ensure_ascii=False) + "\n")
438
+ fout.flush()
439
+
440
+ # ----------------------------------------------------------------
441
+ # MULTI‑THREADED EXECUTION
442
+ # ----------------------------------------------------------------
443
+ else:
444
+ workers = max(1, args.workers)
445
+ logging.info("Running in multi‑threaded mode with %d workers", workers)
446
+ with ThreadPoolExecutor(max_workers=workers) as executor, open(out_path, "a", encoding="utf-8") as fout:
447
+ futures = {executor.submit(evaluate_example, ex, args.agent, args.model_url): ex for ex in data}
448
+ for fut in tqdm(as_completed(futures), total=len(futures), desc="QA loop (multi)"):
449
+ try:
450
+ row = fut.result()
451
+ except Exception as exc:
452
+ logging.exception("Evaluation failed: %s", exc)
453
+ continue
454
+ # print(row['id'])
455
+ if row["judge"] == "correct":
456
+ correct += 1
457
+ row.update({"agent": args.agent, "dataset": args.dataset})
458
+ fout.write(json.dumps(row, ensure_ascii=False) + "\n")
459
+ fout.flush()
460
+
461
+ elapsed = time.perf_counter() - start_time
462
+ accuracy = correct / len(data) if data else 0.0
463
+ print(f"Accuracy: {correct}/{len(data)} = {accuracy:.1%}")
464
+ print(f"Elapsed time: {elapsed:.2f}s ({elapsed/len(data):.2f}s per example)")
465
+
466
+
467
+ if __name__ == "__main__":
468
+ main()
re_call/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # from .inference.re_call import ReCall
2
+ # from .inference.r1_searcher import R1Searcher, R1SearchConfig
3
+ # from .inference.zerosearch import ZeroSearchInference, ZeroSearchConfig
4
+ # from .inference.o1_searcher import O1Cfg, O1Searcher
5
+ # from .inference.simpledeepsearch import SDSCfg, SDSearcher
6
+ from .re_call import ReCall
7
+ __all__ = ["ReCall"]
8
+ # __all__ = ["ReCall", "R1Searcher", "ZeroSearchInference", "ZeroSearchConfig", "R1SearchConfig", "O1Cfg", "O1Searcher", "SDSCfg", "SDSearcher"]
re_call/prompts.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Final
2
+
3
+ # DEEPRESEARCH_REPORT_SYS_PROMPT: Final[str] = r"""
4
+ # You are a DeepResearch analyst and Report Converter. Turn a raw investigation trace into a clear,
5
+ # decision-grade report suitable for executives.
6
+
7
+ # INPUTS (provided in the user message)
8
+ # - QUESTION: the research question.
9
+ # - TRACE: the full transcript (may include assistant/user/tool snippets).
10
+ # - TOOL_CALLS: raw list of tool calls (JSON-ish), which may contain URLs.
11
+
12
+ # CRITICAL SOURCING CONSTRAINTS (non-negotiable)
13
+ # - TRAJECTORY_LINKS = every URL you find in TRACE and TOOL_CALLS. Use ONLY these links. Do NOT add new sources.
14
+ # - Evidence density: cite every non-obvious fact/date/figure/evaluative claim.
15
+ # - Citation format: append raw bracketed URLs immediately after the supported sentence/point,
16
+ # e.g., β€œβ€¦ announced in 2003. [https://example.com/page]”.
17
+ # - Prefer primary/official and the most recent authoritative updates. If sources conflict, explain briefly and cite both.
18
+
19
+ # QUALITY & FRESHNESS
20
+ # - Be neutral, precise, and reproducible. No fabrication.
21
+ # - Distinguish **event date**, **publish/update date**, and **effective date** where relevant.
22
+ # - If critical info is missing, state the gap and proceed with best-effort analysis grounded in available links.
23
+
24
+ # OUTPUT RULES
25
+ # - **Markdown only.** No system markers. No boxed answers (\boxed{}).
26
+ # - Public-facing rationale only (no hidden chain-of-thought).
27
+ # - Length proportional to complexity (short for simple, detailed for complex).
28
+ # - **You decide the sectioning and narrative flow** based on the QUESTION and TRACE. Use headings only if they help clarity.
29
+ # - Keep it decision-useful: tight claims tied to evidence, crisp takeaways, explicit uncertainties.
30
+
31
+ # OPERATION
32
+ # 1) Extract TRAJECTORY_LINKS from TRACE and TOOL_CALLS. These are your only allowable citations.
33
+ # 2) Think privately about the best structure for this topic; then write the report accordingly.
34
+ # 3) Map each included claim to at least one link; mark any necessary but unsupported claim as β€œunsupported”.
35
+ # 4) Normalize names/dates/figures; note gaps and conflicts, and how you resolved them.
36
+ # 5) Conclude with a **deduplicated β€œSources used” list** of the raw URLs you actually cited (one per line).
37
+
38
+ # """
39
+
40
+ DEEPRESEARCH_SYS_PROMPT: Final[str] = r"""
41
+ You are a DeepResearch Assistant.
42
+
43
+ Goal: (1) Produce a concise PLAN that breaks the QUESTION into sections and **maps every URL and tool_call content** in the trace to those sections; (2) Produce a public-facing REPORT that synthesizes **all** information from TRACE/TOOL_CALLS into an insightful report.
44
+
45
+ ========================
46
+ INPUTS
47
+ ========================
48
+ - QUESTION: research question.
49
+ - TRACE: transcript (assistant/user/tool snippets).
50
+ - TOOL_CALLS: raw tool calls (includes URLs and tool_responses).
51
+
52
+
53
+ ========================
54
+ CITATIONS (ACCURACY-FIRST)
55
+ ========================
56
+ - **TRAJECTORY_LINKS** = all URLs in TRACE/TOOL_CALLS. Cite **only** these; do not invent/browse.
57
+ - Cite pivotal or non-obvious claims (dates, numbers, quotes, contested points).
58
+ - **Density with accuracy:** Prefer **dense citations** on non-obvious/pivotal claims **only when confident** the link supports the exact statement; avoid stray/low-confidence citations.
59
+ - **Sources used** = only URLs actually cited in REPORT.
60
+ - Citation format: append raw square bracketed full URLs immediately after the supported sentence/point, e.g., β€œβ€¦ announced in 2003. [https://example.com/page]”.
61
+
62
+
63
+ ========================
64
+ PLAN (MANDATORY CONTENT)
65
+ ========================
66
+ 1) **Question β†’ Sections** (derivation):
67
+ - Decompose QUESTION into sub-questions SQ1..SQn, then plan the structure of the report around that to cover all bases.
68
+ - Clearly outline the breakdown and structure of the report and the thought process for it.
69
+
70
+ 2) **Evidence Map: Section β†’ URL/tool_call mapping**
71
+ - **Harvest** all URLs from TRACE and TOOL_CALLS β†’ this forms TRAJECTORY_LINKS.
72
+ - For **each Section (S1..Sn)**, list the **evidence items** (every TRAJECTORY_LINK and its content explored in the TRACE) relevant to it.
73
+ - **Coverage rule:** Ensure **most** URL/tool_call items from TRACE is mapped to at least one Section (unless truly irrelevant to the topic).
74
+ - Use this table (include all rows; add as many as needed):
75
+ | Section | Item | | Content | Confidence |
76
+ |---|---|---|---|---|
77
+ | S1 | <URL_4> | date/stat/quote/context | High/Med/Low |
78
+ | S2 | <URL_1> <URL_2> | stat/definition/quote | High/Med/Low |
79
+ - If something is truly irrelevant, list under **Omitted as Irrelevant (with reason)**; keep this list short do not cite them in the report in this case.
80
+
81
+ 3) **Layout the Strategy for insight generation**:
82
+ - 4–6 bullets on how you will generate higher level insight / aalysis: e.g., contrast/benchmark, timeline, ratios/growth, causal chain, risks.
83
+ - You may generate insights / analysis by concatenating **general background knowledge** with TRACE facts, but only if the TRACE facts remain central.
84
+ - Beyond description, provide **analysis, interpretation, and recommendations** where possible.
85
+ - Recommendations must be **derived strictly from TRACE evidence**. No hallucinated numbers or unsupported claims.
86
+ - If evidence is insufficient for a clear recommendation, state this explicitly.
87
+
88
+ ========================
89
+ REPORT (MANDATORY CONTENT)
90
+ ========================
91
+ - # Executive Summary β€” 5-10 crisp bullets with concrete takeaways; cite pivotal/non-obvious claims.
92
+ - ## Main Body β€” brief scope and inclusion rules; **provide higher-order insights built on the harvested evidence** (e.g., causal explanations, benchmarks, ratios/growth, timelines, scenarios/risks). Add a one-line deviation note if sections differ from PLAN.
93
+ - ## S1..Sn (exactly as defined in PLAN) β€” each section answers its mapped sub-question and **integrates all mapped evidence**:
94
+ - Weave facts; where β‰₯3 related numbers exist, add a small Markdown table.
95
+ - **Integrate as much of the TRACE/TOOL_CALLS information as possible** in a structured way based on the question decomposition; if an item is only contextual, summarize briefly and attribute.
96
+ - Call out conflicts with both sources cited.
97
+ - ## Recommendations β€” actionable, prioritized; must follow from cited evidence.
98
+ - ## Conclusion β€” 3–6 sentences directly answering the QUESTION.
99
+ - ## Sources used β€” deduplicated raw URLs, one per line (only those cited above).
100
+
101
+ ========================
102
+ EXHAUSTIVENESS & COVERAGE
103
+ ========================
104
+ - **Inclusion duty:** Factual detail explored in TRACE must appear in the final report unless completely irrlevant.
105
+ - **Do not compress away specifics.** Prioritize: (1) exact figures/dates, (2) named entities/products, (3) risks/criticisms, (4) methods/assumptions, (5) contextual detail.
106
+ - **Numeric presentation:** For β‰₯3 related numbers, render a small Markdown table with citations.
107
+ - Be verbose in the Main Body; detailed explanations / exhaustive covergage, novel synthesis, insights and dense citations are encouraged.
108
+
109
+ ========================
110
+ QUALITY TARGETS (SCORING GUARDRAILS)
111
+ ========================
112
+ - **Comprehensiveness (COMP):** Every URL/tool_response mapped in the plan is integrated. The REPORT should **strive to integrate maximum trace information** in context.
113
+ - **Insight/Depth (DEPTH):** Use contrast/benchmarks, timelines, ratios/growth, causal links, scenarios, and risk framing to explain β€œwhy it matters,” building insights **on top of the existing evidence** (no new facts).
114
+ - **Instruction-Following (INST):** Sections mirror sub-questions; each SQ is explicitly answered, the report should be precise and not digress from what is asked in the question.
115
+ - **Readability (READ):** Clear headings, short paragraphs, lead sentences with takeaways, tables for numeric clusters, and **dense-but-accurate** citations.
116
+
117
+ ========================
118
+ STRICT OUTPUT FORMAT
119
+ ========================
120
+ - You must give exactly one single output with the private planning / thinking enclosed within the <think></think> and the public facing report follwing that:
121
+ <think>[Plan here]</think>[Report here]
122
+ - The REPORT is strictly public-facing (no meta/process/thinking).
123
+ - Markdown only. Public-facing rationale; no hidden notes or menntion of the search trace or the thinking process in the report.
124
+ - Target lengt for the Report Section: **β‰₯2000 words** (longer if complexity requires).
125
+ """
126
+
127
+ # SUMMARY_SYS_PROMPT: Final[str] = r"""
128
+ # You are a Summary Assistant.
129
+
130
+ # Goal: Produce a public-facing response that structures all information from input trace into a single answer.
131
+
132
+ # ========================
133
+ # INPUTS
134
+ # ========================
135
+ # - QUESTION: user's question.
136
+ # - TRACE: transcript (assistant/user/tool snippets).
137
+ # - TOOL_CALLS: raw tool calls (includes URLs and tool_responses).
138
+
139
+ # ========================
140
+ # RESPONSE (ANSWER) (MANDATORY CONTENT)
141
+ # ========================
142
+ # - The response to the user's question, enclosed in <answer></answer> tags.
143
+ # - The response must be well-structured and detailed, covering all important steps, ideas, and any evidence/calculations found in the trace.
144
+ # - If the task is CLOSED-ENDED (math/logic with a determinate result; factual single value/word; code producing a definite output), think and reason/plan internally and respond with the final part (explanation, method, proof, etc.) and present the result boxed with LaTeX: \boxed{…}.
145
+ # - If the task is OPEN-ENDED (analysis, synthesis, design choices, multiple valid outcomes), think and reason/plan internally and respond containing a detailed explanation of the search trace, sources, investigation, process/methodology, result/outcome/solution, conclusion, etc.; i.e. create a nicely-structured and detailed structure of the answer for the question, that can be shown to the user who asked it.
146
+ # - Keep the answer detailed and well-structured, providing a thorough explanation/methodology/solution for the final response, whatever is desired for in the user query. Do not just give a one-line/very-short final response. The answer maybe short if the question is trivial, but it must be well-structured and thorough.
147
+
148
+ # ========================
149
+ # STRICT OUTPUT FORMAT
150
+ # ========================
151
+ # - You must give exactly one single output with the private planning / thinking enclosed within the <think></think> and the public facing report follwing that:
152
+ # <think>[Plan here]</think><answer>[Final Answer here]</answer>
153
+ # - The final answer is strictly public-facing (no meta/process/thinking).
154
+ # - Markdown only.
155
+ # """
156
+
157
+ SUMMARY_SYS_PROMPT: Final[str] = r"""
158
+ You are an expert search trace structurer. Given a QUESTION and the full search TRACE (may include tool-call notes),
159
+ write a clear, accurate, self-contained explanation/solution using only the information in the trace. Do not add external facts.
160
+
161
+ What to produce:
162
+ - A single, readable well-structured narrative / solution that covers all important steps, ideas, and any evidence/calculations found in the trace.
163
+ - If the task is CLOSED-ENDED (math/logic with a determinate result; factual single value/word; code producing a definite output), think and reason/plan internally and respond with the final part (explanation, method, proof, etc.) and present the result boxed with LaTeX: \boxed{…}.
164
+ - If the task is OPEN-ENDED (analysis, synthesis, design choices, multiple valid outcomes), think and reason/plan internally and respond containing a detailed explanation of the search trace, sources, investigation, process/methodology, result/outcome/solution, conclusion, etc.; i.e. create a nicely-structured and detailed structure of the answer for the question, that can be shown to the user who asked it.
165
+ - Note: The final part is strictly public-facing (no meta/process/thinking) and is to be enclosed in <answer></answer> tags and the thinking/planning/reasoning is internal and to be compulsorily enclosed within <think></think> tags.
166
+ - The final part can be short or detailed depending on the question but has to be seperately enclosed in <answer></answer> tags and will be after the thinking block (which is to be enclosed in <think></think> tags).
167
+
168
+ Style:
169
+ - Clear prose and paragraphs; use LaTeX sparingly for clarity in math.
170
+ - Prefer thorough and detailed coverage; keep it shorter for trivial items.
171
+ - Use only facts present in the trace. If something is uncertain or missing, state it plainly and proceed with best-effort reasoning.
172
+ - Provide detailed explanation/methodology/solution for the final response (public-facing part), whatever is desired for in the user query. Do not just give a one-line/very-short final response.
173
+ - The final response should be well-structured and detailed and is to be enclosed within <answer></answer> tags.
174
+ - The reasoning part is non-public facing and internal, and should be enclosed within <think></think> tags.
175
+
176
+ **OUTPUT FORMAT:**
177
+ - Enclose your thinking/reasoning/planning (if you are thinking before answering) within the <think></think> tags: <think>{thinking here}</think>{response here}
178
+ - It is compulsory to use the <think></think> tags for enclosing planning/thinking/internal reasoning.
179
+ - Return the final answer in the format:
180
+ ```<think>{your thinking here}</think>
181
+ <answer>{your final answer here}</answer>```
182
+ - The final answer part of the response is strictly public-facing and should be well-structured and detailed.
183
+ - Markdown only.
184
+ """
re_call/re_call.py ADDED
@@ -0,0 +1,1490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ import requests
5
+ import time
6
+ from typing import List, Optional, Dict
7
+ from .prompts import DEEPRESEARCH_SYS_PROMPT, SUMMARY_SYS_PROMPT
8
+ from functools import wraps
9
+ from together import Together # pip install together
10
+ from datetime import datetime # needed for retries / logging and date string (for giving current date and time to LLM)
11
+
12
+ # return decorator
13
+ def retry(max: int = 10, sleep: int = 1, fallback=None):
14
+ """
15
+ Retry `max` times and, if still failing, return `fallback`
16
+ instead of raising. This keeps outer loops alive.
17
+ """
18
+ def decorator(func):
19
+ @wraps(func)
20
+ def wrapper(*args, **kwargs):
21
+ for i in range(max):
22
+ try:
23
+ return func(*args, **kwargs)
24
+ except Exception as e:
25
+ print(f"[retry] attempt {i+1}/{max} failed: {e}")
26
+ if i == max - 1: # last try exhausted
27
+ print(f"[retry] giving up – returning {fallback!r}")
28
+ return fallback # ← swallow the error
29
+ if sleep:
30
+ time.sleep(sleep)
31
+ return wrapper
32
+ return decorator
33
+
34
+ class ReCall():
35
+ date_str = \
36
+ f"""
37
+
38
+ **Note**: Today's Date is {datetime.now().strftime("%Y-%m-%d")}, and time is {datetime.now().strftime("%H:%M:%S")}. This may be useful for answering questions about current events."""
39
+
40
+ anti_chinese_str = \
41
+ """
42
+
43
+ **Note**: Do not respond in chinese, do not think in chinese, only think and respond/answer in English, unless explicitly instructed by the user to respond in some other language."""
44
+
45
+ # proper_formatting_str = \
46
+ # """
47
+ # **Note**: Provide a well-structured answer first, then put only the final short answer in \\boxed{{}}.
48
+
49
+ # **How to format your response**
50
+ # - Write in clear English prose and use Markdown headings/bullets where helpful.
51
+ # - Give a detailed, in-depth explanation of the steps or facts used.
52
+ # - Use LaTeX only for short formulas/equations. For multi-line LaTeX, include line breaks (\\\\) or environments like \\begin{{align}} ... \\end{{align}} when genuinely helpful.
53
+ # - Do **not** wrap the whole response in LaTeX. Only the final short answer goes in \\boxed{{...}} on its own line at the end.
54
+
55
+ # **Examples**
56
+
57
+ # 1) **Simple fact question**
58
+ # **Question:** What is the capital of India?
59
+ # **Brief rationale:** India’s seat of government and primary national institutions are located in New Delhi.
60
+ # **Final:** \\boxed{{New Delhi}}
61
+
62
+ # 2) **Quick calculation**
63
+ # **Question:** Convert 68^\\circ F to Celsius.
64
+ # **Approach:** Use C = (F - 32) \\times \\tfrac{{5}}{{9}}.
65
+ # **Computation:** (68 - 32) \\times \\tfrac{{5}}{{9}} = 20.
66
+ # **Final:** \\boxed{{20^\\circ C}}
67
+
68
+ # 3) **Search & synthesis (structured, detailed)**
69
+ # **Question:** When did the EU’s GDPR go into effect?
70
+
71
+ # **Complete Final Response:**
72
+ # '''**Key findings (evidence, concise):**
73
+ # - **European Commission overview** states GDPR β€œapplies from 25 May 2018.”
74
+ # - **EUR-Lex (Regulation (EU) 2016/679), Article 99**: entered into force 20 days after publication in the OJ (2016), and **applies from 25 May 2018**.
75
+ # - **EDPB FAQs/communications** reiterate that enforcement/application begins **25 May 2018**.
76
+
77
+ # **Cross-check & validation:**
78
+ # - Independent primary sources (Commission portal and EUR-Lex) agree on the same application date. A supervisory body source (EDPB) corroborates.
79
+
80
+ # **Common pitfalls addressed:**
81
+ # - Some secondary blogs list **24 May 2018**β€”this confuses the **last day before** applicability with the first day **of** applicability.
82
+ # - β€œEntered into force” in **2016** (post-publication) is not the same as β€œapplication/effective for obligations,” which is **2018**.
83
+
84
+ # **Date normalization:**
85
+ # - Normalize to an unambiguous calendar date and present in a clear format (e.g., β€œMay 25, 2018”).
86
+
87
+ # **Conclusion:**
88
+ # - The effective (application) date for GDPR obligations across the EU is the same in all Member States and is confirmed by multiple primary sources.
89
+
90
+ # **Final:** \\boxed{{May\ 25,\ 2018}}'''
91
+ # """
92
+
93
+ # print(f"Date string:\n'{date_str}'")
94
+
95
+ # proper_formatting_str = \
96
+ # """
97
+ # **DeepResearch Response Protocol**
98
+ # Provide a comprehensive, decision-grade report first, then put only the short final answer in \\boxed{{}} on its own line at the very end.
99
+
100
+ # ---
101
+
102
+ # ## Mandatory Sections (in order)
103
+
104
+ # 1) **Executive Summary**
105
+ # - 5–10 bullets capturing the direct answer, key numbers/dates, and the top implications.
106
+ # - Include any material uncertainty (e.g., β€œmoderate confidence due to limited primary data”).
107
+
108
+ # 2) **Problem Framing & Scope**
109
+ # - One short paragraph restating the question, goals, and audience.
110
+ # - Clarify interpretations, exclusions, and assumptions. Define key terms and acronyms.
111
+
112
+ # 3) **Method (Search & Validation Plan)**
113
+ # - 5–8 bullets detailing how you searched and validated. Include:
114
+ # - **Source priority:** primary/official (laws, filings, standards, regulator notices) β†’ reputable secondary (major outlets, respected orgs) β†’ tertiary/background.
115
+ # - **Query strategy:** main queries and alternates (synonyms, regional spellings, technical names).
116
+ # - **Freshness policy:** prefer the most recent authoritative updates; when dates matter, distinguish **event date**, **publication/update date**, and **effective date**.
117
+ # - **Triangulation rule:** corroborate all key claims with β‰₯2 independent reputable sources (or 1 clear primary).
118
+ # - **Inclusion/Exclusion:** note discarded sources (paywalled, low quality, self-published without review) and why.
119
+ # - **Conflict resolution:** how disagreements will be weighed (mandate, jurisdiction, methodological rigor, recency).
120
+
121
+ # 4) **Evidence Ledger (Cited Facts)**
122
+ # - 6–15 bullets. Each bullet is a **Fact Card**:
123
+ # - **Claim:** one-sentence fact.
124
+ # - **Evidence:** short quote/figure/line (paraphrase unless a short quote is essential).
125
+ # - **Source:** Publisher/Title β€” (Event Date if applicable) β€” Publish/Update Date β€” Access Date.
126
+ # - **Confidence:** High / Medium / Low.
127
+ # - Group with mini-subheadings where helpful (e.g., β€œOfficial notices”, β€œRegulatory filings”, β€œPress coverage”).
128
+ # - Explicitly flag contradictions.
129
+
130
+ # 5) **Timeline of Key Events**
131
+ # - A compact, chronological list linking milestones to sources; include both event and publication dates where relevant.
132
+
133
+ # 6) **Data Extraction & Normalization** (as needed)
134
+ # - Present important numbers in a small table (≀8 rows) with units, currency (ISO codes, e.g., **USD**), and rounding policy (state precision, e.g., β€œrounded to 2 decimals”).
135
+ # - Perform any conversions or calculations and show formulas succinctly (LaTeX inline for short formulas, e.g., \\( C = (F-32)\\times\\tfrac{{5}}{{9}} \\); use \\begin{{align}}…\\end{{align}} for multi-step math).
136
+ # - Specify timezones for dates/times when relevant.
137
+
138
+ # 7) **Comparative & Sensitivity Analysis** (if applicable)
139
+ # - Contrast competing interpretations, options, or sources; note trade-offs.
140
+ # - Include a brief sensitivity or scenario check if a key parameter could materially change the conclusion.
141
+
142
+ # 8) **Synthesis & Conclusion**
143
+ # - 2–4 tight paragraphs that integrate the evidence, resolve conflicts, and explain *why* the conclusion follows.
144
+ # - Be explicit about scope limits and residual uncertainties.
145
+
146
+ # 9) **Risks, Caveats & Unknowns**
147
+ # - Bullet the major risks, data gaps, and what would most change the answer.
148
+ # - Note any ethical, legal, or safety considerations.
149
+
150
+ # 10) **Recommendations / Next Steps** (if applicable)
151
+ # - Actionable items tailored to the user’s likely goal (e.g., verify with regulator X, monitor source Y weekly, collect dataset Z).
152
+
153
+ # 11) **Answer (one sentence)**
154
+ # - State the direct answer clearly with units/timezone as needed.
155
+
156
+ # 12) **Final**
157
+ # - Repeat only the short final answer inside \\boxed{{...}} with no extra words.
158
+
159
+ # 13) **Source Log (Audit Trail)**
160
+ # - A compact, reproducible list: *Title β€” Publisher/Author β€” (Event Date, if any) β€” Publish/Update Date β€” Access Date β€” URL*.
161
+ # - Prefer diverse, authoritative domains; avoid duplicates.
162
+
163
+ # ---
164
+
165
+ # ## Formatting & Quality Rules
166
+
167
+ # - Use clear English with Markdown headings and bullets; favor short paragraphs.
168
+ # - Do **not** reveal inner monologue or hidden chain-of-thought; provide only public-facing rationale.
169
+ # - Use LaTeX sparingly for math; do **not** wrap the entire response in LaTeX. Only the final short answer goes in \\boxed{{...}}.
170
+ # - Always specify units, currency codes, and timezones when relevant.
171
+ # - When listing β‰₯3 items or comparing options, include a small, focused table rather than long prose.
172
+ # - If information is uncertain or contested, *quantify* the uncertainty (confidence labels or ranges) and state why.
173
+
174
+ # ---
175
+
176
+ # ## Depth & Completeness Expectations
177
+
178
+ # - **Complex/high-stakes queries**: Populate all sections thoroughly; provide triangulated citations and explicit conflict resolution.
179
+ # - **Simple fact queries**: Keep Sections 3–9 concise (one to two lines each) but still cite at least one authoritative source.
180
+ # - Strive for neutrality, reproducibility, and decision usefulness over verbosity.
181
+
182
+ # ---
183
+ # """
184
+
185
+ proper_formatting_str = """"""
186
+
187
+ sys_prompt_non_search = """You are a helpful assistant. You will answer the user's question based on your knowledge and reasoning ability. You do not have access to the internet or any external tools. Do not use search. Answer all questions yourself.""" + date_str + anti_chinese_str
188
+
189
+ sys_prompt_websailor_start = """
190
+ You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
191
+ In this environment you have access to a set of tools you can use to assist with the user query.
192
+ You may perform multiple rounds of function calls. In each round, you can call one or more functions.
193
+
194
+ As you proceed, adhere to the following principles:
195
+
196
+ 1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
197
+
198
+ 2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
199
+
200
+ 3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins."""
201
+
202
+ sys_prompt_websailor = """
203
+ You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
204
+ In this environment you have access to a set of tools you can use to assist with the user query.
205
+ You may perform multiple rounds of function calls. In each round, you can call one or more functions.
206
+
207
+ As you proceed, adhere to the following principles:
208
+
209
+ 1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
210
+
211
+ 2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
212
+
213
+ 3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
214
+
215
+
216
+
217
+ Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
218
+
219
+ In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
220
+ The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
221
+ The results of the function calls will be given back to you after execution, \
222
+ and you can continue to call functions until you get the final answer for the user's question.
223
+
224
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
225
+ <tool_call>
226
+ {{"name": <function-name>, "arguments": <args-json-object>}}
227
+ </tool_call>
228
+ """ + date_str + anti_chinese_str + proper_formatting_str
229
+
230
+ sys_prompt_websailor_deepseek = """
231
+ You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
232
+ In this environment you have access to a set of tools you can use to assist with the user query.
233
+ You may perform multiple rounds of function calls. In each round, you can call one or more functions.
234
+
235
+ As you proceed, adhere to the following principles:
236
+
237
+ 1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
238
+
239
+ 2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
240
+
241
+ 3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
242
+
243
+
244
+
245
+ Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
246
+
247
+ In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
248
+ The reasoning process and function calling are enclosed within <think> </think> and <tool_calls_begin> <tool_calls_end> tags. \
249
+ The results of the function calls will be given back to you after execution, \
250
+ and you can continue to call functions until you get the final answer for the user's question. \
251
+ Finally, if you have got the answer, enclose it within \\boxed{{}} with latex format and do not continue to call functions, \
252
+ i.e., <think> Based on the response from the function call, I get the weather information. </think> The weather in Beijing on 2025-04-01 is \\[ \\boxed{{20C}} \\].
253
+ """ + date_str + anti_chinese_str + proper_formatting_str
254
+
255
+ # sys_prompt_websailor_deepseek = """
256
+ # You are a Web Information Seeking Master. Seek the internet thoroughly and provide accurate answers. You may use tools multiple times.
257
+
258
+ # Principles:
259
+ # 1) Persistent Actions for Answers: explore deeply until you find satisfactory information.
260
+ # 2) Repeated Verification: cross-check and validate before the final answer.
261
+ # 3) Attention to Detail: ensure sources are current, relevant, and credible.
262
+
263
+ # You have the following tools (JSONSchema):
264
+ # ```json
265
+ # {func_schemas}
266
+ # Follow this EXACT tool-call I/O protocol.
267
+
268
+ # TO CALL ONE OR MORE TOOLS:
269
+ # Respond only with this block (no extra text before/after):
270
+ # <|tool▁call▁begin|>function<|tool▁sep|>{tool_name}{args_json}
271
+ # <|tool▁call▁end|>
272
+ # ... (repeat <|tool▁call▁begin|>…<|tool▁call▁end|> for multiple tools)
273
+ # <|tool▁calls▁end|><|end▁of▁sentence|>
274
+
275
+ # HOW TOOL RESULTS ARRIVE:
276
+ # I will send tool outputs back embedded inside a single user message, each wrapped like:
277
+ # <tool_response>{one_tool_call_you_made}
278
+ # {tool_return_text_or_json}
279
+ # </tool_response>
280
+
281
+ # WHAT TO DO NEXT:
282
+
283
+ # If you still need info, emit another tool-calls block (same exact format).
284
+
285
+ # If you have the final answer, output:
286
+ # <answer> …your final answer… </answer>
287
+ # and DO NOT call any more tools.
288
+
289
+ # Important:
290
+
291
+ # Do not expose your internal reasoning; keep thoughts private.
292
+
293
+ # When emitting a tool-calls block, do not include any explanations, only the block specified above.
294
+
295
+ # Arguments must be valid JSON.
296
+
297
+ # Stop tokens to respect: <|end▁of▁sentence|>
298
+ # """
299
+
300
+ system_prompt = """In this environment you have access to a set of tools you can use to assist with the user query. \
301
+ You may perform multiple rounds of function calls. \
302
+ In each round, you can call one or more functions. \
303
+
304
+ Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
305
+
306
+ In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
307
+ The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
308
+ The results of the function calls will be given back to you after execution, \
309
+ and you can continue to call functions until you get the final answer for the user's question. You are encouraged to utilize as many function calls as possible. \
310
+ Finally, if you have got the answer, wrap it in <answer> </answer> **and do not call any more functions**, \
311
+ e.g. <think> Based on the tool results … </think> <answer>20 Β°C</answer>.
312
+
313
+ For each function call, return a JSON object with function name and arguments within <tool_call></tool_call> XML tags:
314
+ <tool_call>
315
+ {{"name": <function-name-1>, "arguments": <args-json-object>}}
316
+ </tool_call>""" + date_str + anti_chinese_str + proper_formatting_str
317
+
318
+ system_prompt_budget = """
319
+ You are an autonomous reasoning agent with access to external tools.
320
+
321
+ The conversation will retain only the *most-recent* <tool_response> block; older ones disappear.
322
+ As soon as you receive tool results, extract the *essential facts tables links etc* that might be needed for later and restate them inside your <think> section.
323
+  **Never copy large bodies of text** or raw JSON from tool output into your visible reply; summarise instead.
324
+
325
+ β—Ž **Workflow**
326
+ 1. In every round, start with <think> … </think> to lay out your short reasoning.
327
+ 2. If you need external information or an action, emit one or more <tool_call> … </tool_call> blocks (JSON spec below).
328
+ 3. When the environment returns <tool_response>, continue reasoning; you may call more tools.
329
+ 4. Once you can answer the user, wrap the final result in <answer> … </answer> and STOP calling tools.
330
+
331
+ β—Ž **Tool call format** (do **not** restate the schema or any explanations):
332
+ <tool_call>
333
+ {{"name": <function-name-1>, "arguments": <args-json-object>}}
334
+ </tool_call>
335
+
336
+ Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
337
+ """ + date_str + anti_chinese_str + proper_formatting_str
338
+
339
+
340
+
341
+ system_prompt_forcing_tool_call = """
342
+ In this environment you have access to a set of tools you can use to assist with the user query.
343
+ You may perform multiple rounds of function calls upto ten. In each round, you can call upto three functions.
344
+
345
+ ──────────────────────── AVAILABLE TOOLS ────────────────────────
346
+ ```json
347
+ [
348
+ {
349
+ "type": "function",
350
+ "function": {
351
+ "name": "pubmed_search",
352
+ "description": "Search PubMed for Medical related queries.",
353
+ "parameters": {
354
+ "type": "object",
355
+ "properties": {
356
+ "query": { "type": "string", "description": "Query to search for." },
357
+ "top_n": { "type": "integer", "description": "Number of hits", "default": 3 }
358
+ },
359
+ "required": ["query"]
360
+ }
361
+ }
362
+ }
363
+ ]
364
+ ```
365
+
366
+ ────────────────────────────── RULES ──────────────────────────────
367
+ 1. You MUST issue one pubmed_search tool call for each answer choice. Each query must relate the clinical context to that option.
368
+ 2. You MAY NOT skip any option or decide based only on internal reasoning. Evidence must be retrieved for all choices.
369
+ 3. You MAY issue follow-up tool calls if your reasoning leads you to need more evidence.
370
+ 4. You MUST wrap all reasoning in <think> </think> tags and all tool usage in <tool_call> </tool_call> tags. Number of <tool_call> and </tool_call> tokens in the entire trace MUST always match.
371
+ 5. Do NOT casually emit the <tool_call> </tool_call> during reasoning unless explicitly calling a tool in the proper format.
372
+ 5. Your final answer must be enclosed a single letter corresponding to the correct option enclosed in the <answer> </answer> tags. Do not output anything else inside these tags.
373
+ 6. DO NOT use any other confusing tags like <thiking> or </thinking>.
374
+ 7. Each <think> </think> block MUST be followed by a <tool_call> </tool_call> or <answer> </answer> or else the program will break without an answer.
375
+
376
+ ───────────────────── DUMMY EXAMPLE INTERLEAVED SKELETON ─────────────────────
377
+ <think>
378
+ We are presented with a 54-year-old woman with invasive ductal carcinoma of the breast and osteolytic lesions in the thoracic spine. This strongly suggests metastatic spread. Our task is to determine the most likely anatomical route of metastasis to the spine.
379
+
380
+ Let’s examine the given options:
381
+ A. Hemiazygos vein
382
+ B. Posterior intercostal veins
383
+ C. Batson’s vertebral venous plexus
384
+ D. Internal mammary lymphatics
385
+
386
+ We'll evaluate each option in turn using available literature and known anatomical pathways.
387
+ **Option A: Hemiazygos vein**
388
+ We begin by evaluating whether the hemiazygos vein could be involved in metastatic spread from breast cancer to the spine.
389
+ </think>
390
+ <tool_call>
391
+ {"name": "pubmed_search", "arguments": {"query": "breast cancer metastasis hemiazygos vein", "top_n": 2}}
392
+ </tool_call>
393
+ <tool_response>
394
+ ...
395
+ </tool_response>
396
+ <think>
397
+ There is limited or no strong evidence suggesting the hemiazygos vein is a common or primary route for vertebral metastasis from breast cancer.
398
+ Lets explore **Option B: Posterior intercostal veins** and **Option C: Batson’s vertebral venous plexus** and **Option D:Internal mammary lymphatics**
399
+ </think>
400
+ <tool_call>
401
+ {"name": "pubmed_search", "arguments": {"query": "posterior intercostal veins breast cancer spinal metastasis", "top_n": 3}}
402
+ </tool_call>
403
+ <tool_call>
404
+ {"name": "pubmed_search", "arguments": {"query": "Batson vertebral venous plexus breast cancer metastasis", "top_n": 3}}
405
+ </tool_call>
406
+ <tool_call>
407
+ {"name": "pubmed_search", "arguments": {"query": "Internal mammary lymphatics breast cancer metastasis", "top_n": 3}}
408
+ </tool_call>
409
+ <tool_response>
410
+ ...
411
+ </tool_response>
412
+ <think>
413
+ While the posterior intercostal veins may be involved in venous drainage, there is insufficient evidence to support them as a primary route for metastasis to the vertebral column.
414
+ where as Batson’s vertebral venous plexus β€” a valveless venous network that connects the thoracic and abdominal veins directly to the spine. I to find more specific information about option C.
415
+ </think>
416
+ <tool_call>
417
+ {"name": "pubmed_search", "arguments": {"query": ""Batson vertebral venous plexus breast cancer metastasis in people over 50", "top_n": 1}}
418
+ </tool_call>
419
+ <think>
420
+ After evaluating all four options, the most plausible route for breast cancer metastasis to the thoracic spine is clearly via Batson’s vertebral venous plexus:
421
+ </think>
422
+ <answer>C</answer>
423
+ """ + date_str + anti_chinese_str + proper_formatting_str
424
+ # STOP_TOKENS =STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"
425
+
426
+
427
+ def __init__(self, executor_url):
428
+ self.executor_url = executor_url
429
+
430
+ def init_prompt(self, func_schemas, question, old_prompt: Optional[str] = None, search_on: bool = True) -> str:
431
+ if old_prompt is None or len(old_prompt.strip()) == 0:
432
+ if search_on:
433
+ system_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
434
+ else:
435
+ system_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
436
+ user_prompt = f"<|im_start|>user\n{question}<|im_end|>"
437
+ assistant_prefix = f"<|im_start|>assistant\n<think>"
438
+ return system_prompt + "\n" + user_prompt + "\n" + assistant_prefix
439
+ else:
440
+ user_prompt = f"<|im_start|>user\n{question}<|im_end|>"
441
+ assistant_prefix = f"<|im_start|>assistant\n<think>"
442
+ return old_prompt + "\n" + user_prompt + "\n" + assistant_prefix
443
+
444
+ def replace_sys_prompt(self, old_prompt: str, func_schemas: str, search_on: bool = True) -> str:
445
+ if search_on:
446
+ new_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
447
+ old_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
448
+ else:
449
+ new_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_non_search}<|im_end|>"
450
+ old_sys_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
451
+
452
+ return old_prompt.replace(old_sys_prompt, new_sys_prompt)
453
+
454
+ def _strip_old_tool_responses(self, prompt: str) -> str:
455
+ TOOL_RESPONSE_RE = re.compile(r"<tool_response>.*?</tool_response>\s*", re.DOTALL)
456
+ """Remove every existing <tool_response> … </tool_response> block."""
457
+ return TOOL_RESPONSE_RE.sub("", prompt)
458
+
459
+ def cat_assistant_response(self, curr_prompt, assistant_response):
460
+ return curr_prompt + assistant_response + "<|im_end|>"
461
+
462
+ def cat_tool_results(self, curr_prompt, tool_calls, results):
463
+ tool_response_str = ""
464
+ for tool_call, result in zip(tool_calls, results):
465
+ tool_response_str += f"<tool_response>{tool_call}\n{result}\n</tool_response>\n"
466
+ tool_response_str = f"<|im_start|>user\n{tool_response_str}<|im_end|>"
467
+ assistant_prefix = f"<|im_start|>assistant\n<think>"
468
+ return curr_prompt + "\n" + tool_response_str + "\n" + assistant_prefix
469
+
470
+ def format_tool_call(self, tool_call_str: str):
471
+ """Convert JSON function call description to Python executable code string."""
472
+ try:
473
+ call_json = json.loads(tool_call_str)
474
+ func_name = call_json['name']
475
+ arguments = call_json.get('arguments', {})
476
+
477
+ args_str = ', '.join(f"{k}={repr(v)}" for k, v in arguments.items())
478
+ return f"{func_name}({args_str})"
479
+ except Exception as e:
480
+ return f"Parse tool call failed: {e}"
481
+
482
+ def execute_tool_calls(self, env: str, tool_calls: List[str]) -> List[str]:
483
+ def exe_tool_call(env, call):
484
+ url = self.executor_url + '/execute'
485
+
486
+ call_str = self.format_tool_call(call)
487
+ # print(call_str)
488
+ if call_str.startswith("error: parse tool call failed"):
489
+ return call_str
490
+
491
+ try:
492
+ data = {
493
+ 'env': env,
494
+ 'call': call_str
495
+ }
496
+ response = requests.post(url, json=data, timeout=60)
497
+ if response.status_code != 200:
498
+ return f"error: {response.status_code}"
499
+ response = response.json()
500
+ ret_str = ''
501
+ if response['result']:
502
+ ret_str += f'result: \n{response["result"]}\n'
503
+ if response['output']:
504
+ ret_str += f'output: \n{response["output"]}\n'
505
+ if response['error']:
506
+ ret_str += f'error: \n{response["error"]}\n'
507
+ return ret_str.strip()
508
+ except requests.exceptions.Timeout:
509
+ return "error: execution timed out"
510
+ except Exception as e:
511
+ return str(e)
512
+
513
+ results = []
514
+ for tool_call in tool_calls:
515
+ result = exe_tool_call(env, tool_call)
516
+ results.append(result)
517
+ return results
518
+
519
+ def validate_tool_calls(self, output_str):
520
+ start_tags = re.findall(r'<tool_call>', output_str)
521
+ end_tags = re.findall(r'</tool_call>', output_str)
522
+
523
+ if len(start_tags) != len(end_tags):
524
+ return False
525
+
526
+ start_positions = [m.start() for m in re.finditer(r'<tool_call>', output_str)]
527
+ end_positions = [m.start() for m in re.finditer(r'</tool_call>', output_str)]
528
+
529
+ for start, end in zip(start_positions, end_positions):
530
+ if start >= end:
531
+ return False
532
+
533
+ return True
534
+
535
+ def extract_tool_calls(self, output_str):
536
+ if not self.validate_tool_calls(output_str):
537
+ return []
538
+
539
+ try:
540
+ pattern = r'<tool_call>((?:(?!</tool_call>).)*)</tool_call>'
541
+ matches = re.finditer(pattern, output_str, re.DOTALL)
542
+
543
+ return [match.group(1).strip() for match in matches]
544
+ except Exception as e:
545
+ return []
546
+
547
+ def extract_tool_calls_deepseek(self, output_str):
548
+ if not self.validate_tool_calls(output_str):
549
+ return []
550
+
551
+ try:
552
+ pattern = r'<tool_calls_begin>((?:(?!</tool_calls_end>).)*)<tool_calls_end>'
553
+ matches = re.finditer(pattern, output_str, re.DOTALL)
554
+
555
+ return [match.group(1).strip() for match in matches]
556
+ except Exception as e:
557
+ return []
558
+
559
+
560
+
561
+ @retry(max=5, sleep=1, fallback={"score": 0})
562
+ def run_ii_searcher(
563
+ self,
564
+ env: str,
565
+ func_schemas: str,
566
+ question: str,
567
+ tokenizer,
568
+ model_url="http://0.0.0.0:1214",
569
+ temperature: float = 0.0,
570
+ max_new_tokens: int = 40960,
571
+ ):
572
+ curr_prompt = self.init_prompt(func_schemas, question)
573
+ all_tool_calls= []
574
+
575
+ for _ in range(16):
576
+ prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
577
+ max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
578
+ # for oss model served via vllm
579
+ # response = requests.post(
580
+ # f'{model_url}/v1/chat/completions',
581
+ # json={
582
+ # "text": curr_prompt,
583
+ # # "reasoning": "medium"
584
+ # },
585
+ # ).json()
586
+ # for sglang served models hf models
587
+ response = requests.post(
588
+ f'{model_url}/generate',
589
+ json={
590
+ "text": curr_prompt,
591
+ "sampling_params": {
592
+ "temperature": temperature,
593
+ "max_new_tokens": max_tokens_left,
594
+ "repetition_penalty": 1.05
595
+ },
596
+
597
+ }
598
+ ).json()
599
+ if "error" in response.keys():
600
+ print("resp",response)
601
+ curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
602
+
603
+ tool_calls: List[str] = self.extract_tool_calls(response['text'])
604
+ all_tool_calls += tool_calls
605
+
606
+ if len(tool_calls) == 0:
607
+ break
608
+
609
+ else:
610
+ results: List[str] = self.execute_tool_calls(env, tool_calls)
611
+ curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
612
+
613
+ return curr_prompt, all_tool_calls
614
+
615
+ # @retry(max=5, sleep=1, fallback={"score": 0})
616
+ # def run(
617
+ # self,
618
+ # env: str,
619
+ # func_schemas: str,
620
+ # question: str,
621
+ # tokenizer,
622
+ # model_url="http://0.0.0.0:1214",
623
+ # temperature: float = 0.0,
624
+ # max_new_tokens: int = 40960,
625
+ # ):
626
+ # curr_prompt = self.init_prompt(func_schemas, question)
627
+ # all_tool_calls= []
628
+
629
+ # for i in range(32):
630
+ # prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
631
+ # max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
632
+ # # for oss model served via vllm
633
+ # # response = requests.post(
634
+ # # f'{model_url}/v1/chat/completions',
635
+ # # json={
636
+ # # "text": curr_prompt,
637
+ # # # "reasoning": "medium"
638
+ # # },
639
+ # # ).json()
640
+ # # for sglang served models hf models
641
+ # response = requests.post(
642
+ # f'{model_url}/generate',
643
+ # json={
644
+ # "text": curr_prompt,
645
+ # "sampling_params": {
646
+ # "temperature": temperature,
647
+ # "max_new_tokens": max_tokens_left,
648
+ # "repetition_penalty": 1.05
649
+ # },
650
+
651
+ # }
652
+ # ).json()
653
+ # if "error" in response.keys():
654
+ # print("resp",response)
655
+ # curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
656
+
657
+ # tool_calls: List[str] = self.extract_tool_calls(response['text'])
658
+ # all_tool_calls += tool_calls
659
+
660
+ # if len(tool_calls) == 0:
661
+ # break
662
+
663
+ # else:
664
+ # # print(f"Step-{i+1}")
665
+ # results: List[str] = self.execute_tool_calls(env, tool_calls)
666
+ # curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
667
+
668
+ # return curr_prompt, all_tool_calls
669
+ from typing import List, Dict, Any, Tuple
670
+ import requests
671
+
672
+ def build_summary_prompt(self, question: str, transcript: str, tool_calls: Any) -> str:
673
+ """Assemble a compact but detailed prompt for summarization."""
674
+ tool_str = ""
675
+ if tool_calls is not None:
676
+ try:
677
+ tool_str = str(tool_calls)
678
+ except Exception:
679
+ tool_str = "<unprintable tool_calls>"
680
+ return (
681
+ "You are given a DeepSearch investigation trace.\n\n"
682
+ f"Question:\n{question}\n\n"
683
+ "Trace (model transcript):\n"
684
+ f"{transcript}\n\n"
685
+ "Tool Calls (as-recorded):\n"
686
+ f"{tool_str}\n\n"
687
+ "β€” End of trace β€”"
688
+ )
689
+
690
+ def reformat_trace(self, s: str) -> str:
691
+ if not s:
692
+ return s
693
+
694
+ t = s
695
+
696
+ # 1) Speaker tags: <|im_start|>assistant -> "ASSISTANT:\n"
697
+ def _speaker(m: re.Match) -> str:
698
+ role = (m.group(1) or "").strip().upper()
699
+ return f"\n{role}:\n"
700
+ t = re.sub(r"<\|im_start\|\>(\w+)", _speaker, t, flags=re.IGNORECASE)
701
+
702
+ # 2) End-of-message tag: drop but keep spacing
703
+ t = re.sub(r"<\|im_end\|\>", "\n", t, flags=re.IGNORECASE)
704
+
705
+ # 3) THINK blocks: replace tags with label, keep content
706
+ t = re.sub(r"<think\s*>", "", t, flags=re.IGNORECASE)
707
+ t = re.sub(r"</think\s*>", "\n", t, flags=re.IGNORECASE)
708
+
709
+ # 4) TOOL RESPONSE blocks: support both 'response' and the misspelt 'resonse'
710
+ t = re.sub(r"<tool_respon[sc]e\s*>", "SEARCH RESULT\n", t, flags=re.IGNORECASE)
711
+ t = re.sub(r"</tool_respon[sc]e\s*>", "\n", t, flags=re.IGNORECASE)
712
+
713
+ # 5) TOOL CALL wrappers: drop tags, keep the JSON/content
714
+ t = re.sub(r"</?tool_call\s*>", "", t, flags=re.IGNORECASE)
715
+
716
+ # 6) Any remaining ChatML specials like <|eot_id|>, <|...|> -> remove
717
+ t = re.sub(r"<\|[^>]+?\|>", "", t)
718
+
719
+ # 7) Remove any other angle-bracket tags we didn’t explicitly keep
720
+ # (leaves inner text intact). This will strip e.g. <tool_response_extra>
721
+ t = re.sub(r"</?[^>\n]+?>", "", t)
722
+
723
+ # 8) Normalize whitespace (collapse 3+ newlines to 2)
724
+ t = re.sub(r"\n{3,}", "\n\n", t).strip()
725
+
726
+ return t
727
+
728
+ def _openai_client(self):
729
+ try:
730
+ from openai import OpenAI # type: ignore
731
+ except Exception as e:
732
+ raise RuntimeError("openai package not installed. `pip install openai`") from e
733
+ return OpenAI()
734
+
735
+ def init_summary_prompt(self, system_prompt: str, prompt: str) -> str:
736
+ system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>"
737
+ user_prompt = f"<|im_start|>user\n{prompt}<|im_end|>"
738
+ assistant_prefix = f"<|im_start|>assistant\n<think>"
739
+ return system_prompt + "\n" + user_prompt + "\n" + assistant_prefix
740
+
741
+ def _call_hf_endpoint(self, base_url: str, system_prompt: str, prompt: str, temperature: float, max_tokens: int, deepresearch_on: bool) -> str:
742
+ curr_prompt = self.init_summary_prompt(system_prompt, prompt)
743
+
744
+ hf_token= os.environ['HF_TOKEN']
745
+
746
+ headers = {
747
+ "Accept" : "application/json",
748
+ "Authorization": f"Bearer {hf_token}",
749
+ "Content-Type": "application/json"
750
+ }
751
+
752
+ # print(f"User Prompt:\n{curr_prompt}\n\n")
753
+
754
+ response_summary = requests.post(
755
+ url=f"{base_url}",
756
+ headers=headers,
757
+ json={
758
+ "inputs": curr_prompt,
759
+ "parameters": {
760
+ "temperature": temperature,
761
+ "max_new_tokens": max_tokens,
762
+ "top_p": 0.95,
763
+ "repetition_penalty": 1.05,
764
+ },
765
+ },
766
+ timeout=300,
767
+ ).json()
768
+
769
+ if isinstance(response_summary, list):
770
+ response_summary = response_summary[0]
771
+
772
+ if isinstance(response_summary, dict) and "error" in response_summary:
773
+ # Log the error as assistant text for visibility and break
774
+ err_msg = f"[model_error] {response_summary.get('error')}"
775
+ print("Got error response from summarising model:", err_msg, end="\n\n")
776
+
777
+ assistant_text = response_summary.get("generated_text", "")
778
+
779
+ if curr_prompt == assistant_text[:len(curr_prompt)]:
780
+ assistant_text = assistant_text[len(curr_prompt):]
781
+
782
+ # print(assistant_text)
783
+
784
+ report = re.split(r"</think\s*>", assistant_text, flags=re.IGNORECASE)[-1]
785
+ # plan = re.split(r"</think\s*>", assistant_text, flags=re.IGNORECASE)[0]
786
+
787
+ # print(report, "\n\n")
788
+
789
+ if not deepresearch_on:
790
+ report = report.strip()
791
+ # report = report[::-1]
792
+ # str_find = "Final Answer:"
793
+ # pos = report.find(str_find[::-1])
794
+ # pos += len(str_find)
795
+ # report = report[pos:][::-1]
796
+ # report = report.rstrip('# \n-').strip(' \n-')
797
+
798
+ start_tag = "<answer>"
799
+ end_tag = "</answer>"
800
+ pos_start = report.find(start_tag)
801
+ pos_end = report[pos_start:].find(end_tag) + pos_start
802
+ answer = report
803
+ if pos_start != -1 and pos_end != -1:
804
+ answer = report[pos_start + len(start_tag):pos_end].strip()
805
+
806
+ str_find = "Final Answer:"
807
+ if str_find in answer:
808
+ answer = answer[::-1]
809
+ pos = answer.find(str_find[::-1])
810
+ pos += len(str_find)
811
+ answer = answer[pos:][::-1]
812
+ answer = answer.rstrip('# \n-').strip(' \n-')
813
+
814
+ # print("answer:")
815
+ # print(answer, "\n\n")
816
+
817
+ return answer
818
+
819
+ report = report.strip()
820
+ report = report[::-1]
821
+ str_find = "Sources used"
822
+ pos = report.find(str_find[::-1])
823
+ pos += len(str_find)
824
+ report = report[pos:][::-1]
825
+ report = report.rstrip('# \n-').strip(' \n-')
826
+
827
+ if not report.startswith("##") and report.startswith("#"):
828
+ report = "#" + report
829
+ elif not report.startswith("##") and not report.startswith("#"):
830
+ report = "## " + report
831
+
832
+ # report = '\n\n' + report.strip()
833
+
834
+ # print(report.find('Executive Summary'), report.find('#'))
835
+ # print(f"'{report[:20]}'")
836
+
837
+ # print(report,"\n\n")
838
+
839
+ urls = {}
840
+ count = 1
841
+
842
+ while "[http" in report:
843
+ start_idx = report.find("[http")
844
+ end_idx = report.find("]", start_idx)
845
+ if end_idx != -1:
846
+ url_string = report[start_idx + 1:end_idx]
847
+ url_list = []
848
+ while len(url_string) > 0:
849
+ pos1 = url_string.find(";")
850
+ pos2 = url_string.find(",")
851
+ pos3 = url_string.find(" ")
852
+
853
+ if pos1 == -1:
854
+ pos1 = len(url_string) + 1
855
+ if pos2 == -1:
856
+ pos2 = len(url_string) + 1
857
+ if pos3 == -1:
858
+ pos3 = len(url_string) + 1
859
+
860
+ pos = min(pos1, pos2, pos3)
861
+
862
+ if pos == len(url_string) + 1:
863
+ url = url_string
864
+ else:
865
+ url = url_string[:pos]
866
+
867
+ url_list.append(url)
868
+
869
+ if pos < len(url_string):
870
+ url_string = url_string[pos + 1:].lstrip(" ,;")
871
+ else:
872
+ break
873
+
874
+ report_new = report[:start_idx] + '(**'
875
+ for url in url_list:
876
+ if url not in urls:
877
+ urls[url] = count
878
+ count += 1
879
+ report_new += f'[{urls[url]}], '
880
+ report_new = report_new[:-2]
881
+ report_new += '**)' + report[end_idx+1:]
882
+ report = report_new
883
+ else:
884
+ break
885
+
886
+ if len(urls) > 0:
887
+ report += "\n\n## Sources used:\n"
888
+ sorted_urls = sorted(urls.items(), key=lambda x: x[1])
889
+ for url, idx in sorted_urls:
890
+ report += f"- **{idx}**: {url}\n"
891
+ report += '\n'
892
+ # adding references (auto-removed in markdown)
893
+ for url, idx in sorted_urls:
894
+ report += f"[{idx}]: {url}\n"
895
+
896
+ # print(report,"\n\n")
897
+
898
+ return report
899
+
900
+ def _route_and_summarize(
901
+ self,
902
+ summary_llm: str,
903
+ system_prompt: str,
904
+ prompt: str,
905
+ *,
906
+ temperature: float,
907
+ max_tokens: int,
908
+ deepresearch_on: bool,
909
+ ) -> str:
910
+ """
911
+ If `summary_llm` starts with 'http', treat as vLLM base_url; else treat as an OpenAI model id.
912
+ For vLLM, prepend [SYSTEM]/[USER] tags; for OpenAI, pass messages with system+user.
913
+ """
914
+ if not summary_llm.strip().lower().startswith("gpt-"):
915
+ # print(system_prompt)
916
+ # print(prompt)
917
+ return self._call_hf_endpoint(summary_llm, system_prompt, prompt, temperature=temperature, max_tokens=max_tokens, deepresearch_on=deepresearch_on)
918
+
919
+ else:
920
+ client = self._openai_client()
921
+ rsp = client.chat.completions.create(
922
+ model=summary_llm,
923
+ temperature=temperature,
924
+ messages=[
925
+ {"role": "system", "content": system_prompt},
926
+ {"role": "user", "content": prompt},
927
+ ],
928
+ max_tokens=max_tokens,
929
+ )
930
+
931
+ return rsp.choices[0].message.content or ""
932
+
933
+ @retry(max=5, sleep=1, fallback={"score": 0})
934
+ def run(
935
+ self,
936
+ env: str,
937
+ func_schemas: str,
938
+ question: str,
939
+ tokenizer,
940
+ model_url: str = "http://0.0.0.0:1214",
941
+ temperature: float = 0.0,
942
+ max_new_tokens: int = 40960,
943
+ top_p: float = 0.6,
944
+ old_prompt: Optional[str] = None,
945
+ deepresearch_on: bool = True,
946
+ summary_llm: str = "gpt-4.1-mini"
947
+ ):
948
+ # ) -> Tuple[str, List[str], List[Dict[str, str]]]:
949
+ """
950
+ Returns:
951
+ curr_prompt: the final prompt buffer (with assistant/tool traces you maintain internally)
952
+ all_tool_calls: flat list of all tool call strings extracted across steps
953
+ chat: a lightweight chat transcript list[{"role": "...", "content": "..."}]
954
+ β€’ 'user' items = the original question + aggregated tool responses
955
+ β€’ 'assistant' items = model responses (and a compact line-list of tool calls)
956
+ """
957
+ # off_str = "\n\n**User has TURNED OFF search**. **DO NOT use search**. **Answer all questions YOURSELF**. **DO NOT use any tools**.\n**YOUR FIRST-RESPONSE WILL BE CONSIDERED AS THE FINAL ANSWER**. **YOU WILL NOT GET TO CALL TOOLS AND WAIT FOR TOOL RESULTS AND THEN ANSWER**.\n**YOU WON'T BE ALLOWED TO CHAT AND CALL TOOLS, IN A MULTI-TURN FASHION**. **YOU WILL CHAT IN A SINGLE-TURN FORMAT**.\n**SO MAKE SURE YOUR FIRST RESPONSE IS THE FINAL ANSWER**.\n"
958
+
959
+ # if not search_on and (old_prompt is not None and self.sys_prompt_websailor_start not in old_prompt):
960
+ # question += off_str
961
+
962
+ search_on = True
963
+
964
+ if old_prompt is not None:
965
+ old_prompt = self.replace_sys_prompt(old_prompt, func_schemas, search_on)
966
+
967
+ # Build runtime prompt and initialize accumulators
968
+ curr_prompt = self.init_prompt(func_schemas, question, old_prompt, search_on)
969
+ all_tool_calls: List[str] = []
970
+ chat: List[Dict[str, str]] = []
971
+
972
+ # Seed transcript with JUST the question (no system prompt)
973
+ chat.append({"role": "user", "content": question})
974
+
975
+ for i in range(64):
976
+ # Budget tokens for this step
977
+ prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
978
+ max_tokens_left = max(1, max_new_tokens - len(prompt_tokens) - 100)
979
+
980
+ # ---- Model call (sglang/vLLM-style JSON) ----
981
+ # If you switch to /v1/chat/completions, adjust accordingly.
982
+ hf_token= os.environ['HF_TOKEN']
983
+
984
+ headers = {
985
+ "Accept" : "application/json",
986
+ "Authorization": f"Bearer {hf_token}",
987
+ "Content-Type": "application/json"
988
+ }
989
+
990
+ # print(f"User Prompt:\n{curr_prompt}\n\n")
991
+
992
+ response = requests.post(
993
+ url=f"{model_url}",
994
+ headers=headers,
995
+ json={
996
+ "inputs": curr_prompt,
997
+ "parameters": {
998
+ "temperature": temperature,
999
+ "max_new_tokens": max_tokens_left,
1000
+ "top_p": top_p,
1001
+ "repetition_penalty": 1.05,
1002
+ },
1003
+ },
1004
+ timeout=300,
1005
+ ).json()
1006
+
1007
+ if isinstance(response, list):
1008
+ response = response[0]
1009
+
1010
+ if isinstance(response, dict) and "error" in response:
1011
+ # Log the error as assistant text for visibility and break
1012
+ err_msg = f"[model_error] {response.get('error')}"
1013
+ print("Got error response from model:", err_msg, end="\n\n")
1014
+ chat.append({"role": "assistant", "content": err_msg})
1015
+ break
1016
+
1017
+ assistant_text = response.get("generated_text", "")
1018
+
1019
+ if curr_prompt == assistant_text[:len(curr_prompt)]:
1020
+ # print("Current prompt is a prefix to generated text.")
1021
+ # If the assistant's response is just a continuation of the prompt, we can use it directly
1022
+ assistant_text = assistant_text[len(curr_prompt):]
1023
+
1024
+ # print(f"Assistant Text:\n{assistant_text}\n\n")
1025
+
1026
+ # Append assistant's raw text to chat
1027
+ chat.append({"role": "assistant", "content": assistant_text})
1028
+
1029
+ # Update your running prompt with assistant text
1030
+ curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
1031
+
1032
+ # Extract tool calls from the assistant text
1033
+ if search_on:
1034
+ tool_calls: List[str] = self.extract_tool_calls(assistant_text)
1035
+
1036
+ else:
1037
+ tool_calls: List[str] = []
1038
+
1039
+ # yield "assistant_resp", (assistant_text, tool_calls)
1040
+
1041
+ if tool_calls:
1042
+ yield "assistant_resp", (assistant_text, tool_calls)
1043
+ all_tool_calls.extend(tool_calls)
1044
+
1045
+ # Log tool calls as an assistant message (newline-joined)
1046
+ chat.append({"role": "assistant", "content": "\n".join(tool_calls)})
1047
+
1048
+ # Execute tools and collect results
1049
+ results: List[str] = self.execute_tool_calls(env, tool_calls)
1050
+
1051
+ yield "tool_results", (results, )
1052
+
1053
+ # Feed tool results back into prompt
1054
+ curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
1055
+
1056
+ # Aggregate tool responses into a single user message
1057
+ tool_res_blocks = []
1058
+ for idx, (call, res) in enumerate(zip(tool_calls, results), 1):
1059
+ tool_res_blocks.append(f"[Tool {idx}] Result:\n{res}")
1060
+ chat.append({"role": "user", "content": "\n\n".join(tool_res_blocks)})
1061
+
1062
+ else:
1063
+ if search_on:
1064
+ prompt = self.build_summary_prompt(question, self.reformat_trace(curr_prompt) or "", all_tool_calls)
1065
+ system_prompt = DEEPRESEARCH_SYS_PROMPT if deepresearch_on else SUMMARY_SYS_PROMPT
1066
+
1067
+ summary_text = self._route_and_summarize(
1068
+ summary_llm=summary_llm if deepresearch_on else model_url,
1069
+ system_prompt=system_prompt,
1070
+ prompt=prompt,
1071
+ temperature=0.6,
1072
+ max_tokens=16000,
1073
+ deepresearch_on=deepresearch_on
1074
+ )
1075
+
1076
+ summary_text_splits = summary_text.split("</think>")
1077
+ summary_text_initial = summary_text_splits[0]
1078
+ summary_text_initial = summary_text_initial.replace("<think>", "").strip()
1079
+ summary_text_final = summary_text_splits[-1]
1080
+
1081
+ if len(summary_text_initial) > 0 and "</think>" in summary_text:
1082
+ yield "assistant_resp", (summary_text_initial, [])
1083
+ yield "tool_results", ([], )
1084
+ yield "assistant_resp", (summary_text_final, tool_calls)
1085
+ # print(f"No tool calls found in assistant response.\nAssistant Response:\n{assistant_text}\n\n")
1086
+ else:
1087
+ yield "assistant_resp", (assistant_text, tool_calls)
1088
+ print(f"Search is off, so no tool calls expected and no tool calls called.\nAssistant Response:\n{assistant_text}\n\n")
1089
+ # No tool calls β†’ model produced a final answer; stop.
1090
+ break
1091
+
1092
+ # Return the original outputs plus the chat-style transcript
1093
+ # return curr_prompt, all_tool_calls, chat
1094
+
1095
+ return "end", (curr_prompt, )
1096
+
1097
+ @retry(max=5, sleep=1, fallback={"score": 0})
1098
+ def run_deepseek(
1099
+ self,
1100
+ env: str,
1101
+ func_schemas: str,
1102
+ question: str,
1103
+ model_name: str,
1104
+ temperature: float = 0.0,
1105
+ top_p: float = 0.95,
1106
+ max_tokens: int = 32768,
1107
+ ):
1108
+ # print("AA"* 100)
1109
+ """
1110
+ Chat-based ReCall loop for DeepSeek-R1 on Together.
1111
+ """
1112
+ sys_content = self.sys_prompt_websailor_deepseek.format(func_schemas=func_schemas)
1113
+ # sys_content = self.init_prompt(func_schemas, question)
1114
+
1115
+ messages = [
1116
+ {"role": "system", "content": sys_content},
1117
+ {"role": "user", "content": question},
1118
+ ]
1119
+
1120
+ # client = Together(api_key="")
1121
+ client = Together(api_key="")
1122
+ all_tool_calls = []
1123
+ for turn in range(32): # up to 10 reasoning turns
1124
+ resp = client.chat.completions.create(
1125
+ model=model_name,
1126
+ # model="Qwen/Qwen3-235B-A22B-fp8-tput",
1127
+ messages=messages,
1128
+ temperature=temperature,
1129
+ top_p=top_p,
1130
+ max_tokens=39000,
1131
+ stop=["<|end▁of▁sentence|>", "<|im_end|>"]
1132
+ )
1133
+ # print(resp)
1134
+
1135
+
1136
+ assistant_text = resp.choices[0].message.content
1137
+ # print(assistant_text)
1138
+ messages.append({"role": "assistant", "content": assistant_text})
1139
+ # print(f"assistant_output: {assistant_text}")
1140
+
1141
+ # β›‘ Safe tool call extraction with diagnostic
1142
+ # try:
1143
+ # print("Extracting tool calls")
1144
+ tool_calls = self.extract_tool_calls_deepseek(assistant_text)
1145
+ print(tool_calls)
1146
+ all_tool_calls += tool_calls
1147
+ # except Exception as e:
1148
+ # print(f"Extraction failed with exception {e}")
1149
+ # err_msg = f"<tool_response>Tool call extraction failed on turn {turn+1}: {str(e)}</tool_response>"
1150
+ # messages.append({"role": "user", "content": err_msg})
1151
+ # continue # continue to next turn instead of breaking
1152
+ if "<answer>" in assistant_text:
1153
+ break
1154
+
1155
+ if len(tool_calls) != 0:
1156
+ results = self.execute_tool_calls(env, tool_calls)
1157
+ tool_resp_block = "".join(
1158
+ f"<tool_response>{c}\n{r}\n</tool_response>\n"
1159
+ for c, r in zip(tool_calls, results)
1160
+ )
1161
+ messages.append({"role": "user", "content": tool_resp_block})
1162
+ # print(f"Tool Response {tool_resp_block}")
1163
+ else:
1164
+ print("no answer or tool call")
1165
+ break
1166
+
1167
+ trajectory = "\n".join(
1168
+ f"<{m['role']}>\n{m['content']}" for m in messages
1169
+ if m["role"] != "system"
1170
+ )
1171
+ return trajectory, all_tool_calls
1172
+
1173
+
1174
+ # ────────────────────────────────────────────────────────────────
1175
+ # HF-endpoint version of β€œretrieve β†’ inject β†’ tool loop”
1176
+ # ────────────────────────────────────────────────────────────────
1177
+ @retry(max=5, sleep=1, fallback=None)
1178
+ def run_with_prompt_injection(
1179
+ self,
1180
+ env: str,
1181
+ func_schemas: str,
1182
+ question: str,
1183
+ model_url: str = "http://0.0.0.0:1214",
1184
+ temperature: float = 0.0,
1185
+ max_new_tokens: int = 512,
1186
+ top_n: int = 5,
1187
+ ):
1188
+ """
1189
+ 0) call pubmed_search(question, top_n) once via the sandbox
1190
+ 1) inject those snippets into the very first user message
1191
+ 2) continue with the normal multi-turn ReCall loop against *model_url*
1192
+ """
1193
+
1194
+ # 0️⃣ do a single retrieval tool call
1195
+ retrieve_call = json.dumps({
1196
+ "name": "pubmed_search",
1197
+ "arguments": {"query": question, "top_n": top_n}
1198
+ })
1199
+ retrieval_raw = self.execute_tool_calls(env, [retrieve_call])[0]
1200
+ try:
1201
+ snippets_block = retrieval_raw.split("result:", 1)[-1].strip()
1202
+ except Exception:
1203
+ snippets_block = ""
1204
+
1205
+ # 1️⃣ build initial prompt with injected snippets
1206
+ user_msg = (
1207
+ f"Question: {question}\n\n"
1208
+ "Here are some relevant PubMed snippets:\n"
1209
+ f"{snippets_block}"
1210
+ ) if snippets_block else f"Question: {question}"
1211
+
1212
+ sys_prompt = self.init_prompt(func_schemas, question)
1213
+ system_prompt = f"<|im_start|>system\n{sys_prompt}<|im_end|>"
1214
+ user_prompt = f"<|im_start|>user\n{user_msg}<|im_end|>"
1215
+ assistant_pref= f"<|im_start|>assistant\n<think>"
1216
+ curr_prompt = system_prompt + "\n" + user_prompt + "\n" + assistant_pref
1217
+
1218
+ # 2️⃣ normal ReCall loop hitting the HF inference endpoint
1219
+ for _ in range(10):
1220
+ resp = requests.post(
1221
+ f"{model_url}/generate",
1222
+ json={
1223
+ "text": curr_prompt,
1224
+ "sampling_params": {
1225
+ "temperature": temperature,
1226
+ "max_new_tokens": max_new_tokens,
1227
+ }
1228
+ },
1229
+ timeout=120,
1230
+ ).json()
1231
+ if "error" in resp.keys():
1232
+ print("resp", resp)
1233
+ assistant_txt = resp["text"]
1234
+ curr_prompt = self.cat_assistant_response(curr_prompt, assistant_txt)
1235
+
1236
+ tool_calls = self.extract_tool_calls(assistant_txt)
1237
+ if len(tool_calls) != 0:
1238
+ # break # model produced an answer β†’ done
1239
+
1240
+ results = self.execute_tool_calls(env, tool_calls)
1241
+ curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
1242
+
1243
+ else:
1244
+ continue
1245
+ return curr_prompt
1246
+
1247
+
1248
+
1249
+ @retry(max=5, sleep=1, fallback={"score": 0})
1250
+ def run_budget(
1251
+ self,
1252
+ env: str,
1253
+ func_schemas: str,
1254
+ question: str,
1255
+ model_url: str = "http://0.0.0.0:1214",
1256
+ temperature: float = 0.0,
1257
+ max_new_tokens: int = 2048,
1258
+ ) -> str:
1259
+ """
1260
+ Execute an agentic dialogue with external tools while *pruning* previous
1261
+ <tool_response> blocks to prevent context-length explosion.
1262
+ """
1263
+ curr_prompt = self.init_prompt(func_schemas, question)
1264
+
1265
+ for _ in range(16): # hard loop-limit
1266
+ # ── 1. Call the model
1267
+ rsp = requests.post(
1268
+ f"{model_url}/generate",
1269
+ json={
1270
+ "text": curr_prompt,
1271
+ "sampling_params": {
1272
+ "temperature": temperature,
1273
+ "max_new_tokens": max_new_tokens,
1274
+ "stop": ["<|im_end|>", "</think>", "</think>\n" "</think>\n\n"],
1275
+ },
1276
+
1277
+ },
1278
+ timeout=120,
1279
+ ).json()
1280
+ generated = rsp["text"] # what you have now
1281
+ matched = rsp["meta_info"]["finish_reason"].get("matched")
1282
+
1283
+ # β‡’Β append the tag back only if it was removed
1284
+ if matched and not generated.endswith(matched):
1285
+ generated += matched
1286
+
1287
+ # Fail fast on server error
1288
+ if "error" in rsp:
1289
+ raise RuntimeError(rsp["error"])
1290
+
1291
+ assistant_text: str = rsp["text"]
1292
+ curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
1293
+
1294
+ # ── 2. Check for final answer ────────────────────────────────────
1295
+ if "<answer>" in assistant_text:
1296
+ break
1297
+
1298
+ # ── 3. Extract & execute tool calls ──────────────────────────────
1299
+ tool_calls: List[str] = self.extract_tool_calls(assistant_text)
1300
+ if not tool_calls: # continue reasoning without calling a tool
1301
+ continue
1302
+
1303
+ results: List[str] = self.execute_tool_calls(env, tool_calls)
1304
+
1305
+
1306
+ # ── 4. BEFORE appending new tool output, drop all old ones ───────
1307
+ curr_prompt =self. _strip_old_tool_responses(curr_prompt)
1308
+
1309
+ # ── 5. Append *only* the fresh tool_response block ───────────────
1310
+ curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
1311
+
1312
+ return curr_prompt
1313
+
1314
+
1315
+
1316
+
1317
+ def _strip_old_tool_responses_msgs(self, messages: list[dict]) -> list[dict]:
1318
+ """
1319
+ Return a copy of `messages` with every *user* message that starts with
1320
+ <tool_response> removed. Keeps assistant turns untouched.
1321
+ """
1322
+ return [
1323
+ m for m in messages
1324
+ if not (m["role"] == "user" and m["content"].lstrip().startswith("<tool_response>"))
1325
+ ]
1326
+ # ────────── budget version ──────────
1327
+ @retry(max=5, sleep=1, fallback={"score": 0})
1328
+ def run_deepseek_budget(
1329
+ self,
1330
+ env: str,
1331
+ func_schemas: str,
1332
+ question: str,
1333
+ api_key: str,
1334
+ model_name: str,
1335
+ temperature: float = 0.0,
1336
+ top_p: float = 0.95,
1337
+ max_tokens: int = 32768,
1338
+ max_turns: int = 10,
1339
+ ):
1340
+ """
1341
+ Chat-based ReCall loop for DeepSeek-R1 **with context-budget pruning**.
1342
+ Keeps only the *latest* <tool_response> block to avoid prompt bloat.
1343
+ """
1344
+ sys_content = self.system_prompt_budget.format(func_schemas=func_schemas)
1345
+
1346
+ messages = [
1347
+ {"role": "system", "content": sys_content},
1348
+ {"role": "user", "content": question},
1349
+ ]
1350
+
1351
+ client = Together(api_key=api_key)
1352
+
1353
+ for turn in range(max_turns):
1354
+ # ── 1. model call ───────────────────────────────────────────────
1355
+ resp = client.chat.completions.create(
1356
+ model=model_name,
1357
+ messages=messages,
1358
+ temperature=temperature,
1359
+ top_p=top_p,
1360
+ max_tokens=max_tokens,
1361
+ stop=["</tool_call>", "<|end▁of▁sentence|>"],
1362
+ )
1363
+ assistant_text = resp.choices[0].message.content
1364
+ messages.append({"role": "assistant", "content": assistant_text})
1365
+
1366
+ print(f"**assistant** \n {assistant_text}")
1367
+
1368
+ # ── 2. finished? ────────────────────────────────────────────────
1369
+ if "<answer>" in assistant_text:
1370
+ break
1371
+
1372
+ # ── 3. parse tool calls ────────────────────────────────────────
1373
+ tool_calls = self.extract_tool_calls(assistant_text)
1374
+ print(f"**tool_calls** \n {tool_calls}")
1375
+ if not tool_calls:
1376
+ continue # keep reasoning without tools
1377
+
1378
+ # ── 4. execute tools ───────────────────────────────────────────
1379
+ results = self.execute_tool_calls(env, tool_calls)
1380
+ print(f"**tool_response** \n {results}")
1381
+
1382
+ # ── 5. prune & append fresh tool_response ──────────────────────
1383
+ messages = self._strip_old_tool_responses_msgs(messages)
1384
+
1385
+ tool_resp_block = "".join(
1386
+ f"<tool_response>{c}\n{r}\n</tool_response>\n"
1387
+ for c, r in zip(tool_calls, results)
1388
+ )
1389
+ messages.append({"role": "user", "content": tool_resp_block})
1390
+
1391
+ # ── 6. flatten & return trajectory (sans system for readability) ───
1392
+ trajectory = "\n".join(
1393
+ f"<{m['role']}>\n{m['content']}" for m in messages if m["role"] != "system"
1394
+ )
1395
+ return trajectory
1396
+
1397
+
1398
+ @retry(max=5, sleep=1, fallback=None)
1399
+ def run_deepseek_with_prompt_injection(
1400
+ self,
1401
+ env: str,
1402
+ func_schemas: str,
1403
+ question: str,
1404
+ api_key: str,
1405
+ model_name: str,
1406
+ temperature: float = 0.0,
1407
+ top_p: float = 0.95,
1408
+ max_tokens: int = 32768,
1409
+ ):
1410
+ """
1411
+ 1) Call pubmed_search(question, top_n=5) as a tool to get snippets.
1412
+ 2) Inject them into the first user message.
1413
+ 3) Proceed with the usual DeepSeek-R1 tool‐based rollout.
1414
+ """
1415
+
1416
+ # ── Step 0: prepare the single‐tool call for retrieval ───────────────
1417
+ retrieve_call = json.dumps({
1418
+ "name": "pubmed_search",
1419
+ "arguments": {
1420
+ "query": question,
1421
+ "top_n": 5
1422
+ }
1423
+ })
1424
+
1425
+ # Execute it once via your helper
1426
+ # note: `env` must include whatever import / client‐setup
1427
+ # your sandbox needs to run pubmed_search(...)
1428
+ raw_retrieval_results = self.execute_tool_calls(env, [retrieve_call])[0]
1429
+ # print("AAAAA"*100)
1430
+ try:
1431
+ snippets = raw_retrieval_results[9:] #"remove result: str"
1432
+ # print(snippets)
1433
+ except:
1434
+ snippets = ""
1435
+ # print(f"[ReCall] Retriever call failed to parse JSON, got:\n{raw_retrieval_results!r}")
1436
+
1437
+ # ── Step 1: build the injected user prompt ────────────────────────────
1438
+ if snippets:
1439
+
1440
+ user_content = (
1441
+ f"Question: {question}\n\n"
1442
+ "Here are some relevant PubMed snippets:\n"
1443
+ f"{snippets}"
1444
+ )
1445
+ else:
1446
+ user_content = f"Question: {question}"
1447
+
1448
+ # ── Step 2: start the chat history ────────────────────────────────────
1449
+ sys_content = self.system_prompt_forcing_tool_call
1450
+ messages = [
1451
+ {"role": "system", "content": sys_content},
1452
+ {"role": "user", "content": user_content},
1453
+ ]
1454
+ client = Together(api_key=api_key)
1455
+
1456
+ # ── Step 3: your normal ReCall tool‐calling loop ─────────────────────
1457
+ for turn in range(10):
1458
+ resp = client.chat.completions.create(
1459
+ model = model_name,
1460
+ messages = messages,
1461
+ temperature = temperature,
1462
+ top_p = top_p,
1463
+ max_tokens = max_tokens,
1464
+ stop = ["</tool_call>", "<|end▁of▁sentence|>"]
1465
+ )
1466
+
1467
+ assistant_text = resp.choices[0].message.content
1468
+ messages.append({"role": "assistant", "content": assistant_text})
1469
+
1470
+ tool_calls = self.extract_tool_calls(assistant_text)
1471
+ if not tool_calls:
1472
+ break
1473
+
1474
+ # Execute all of the tool calls in one go
1475
+ results = self.execute_tool_calls(env, tool_calls)
1476
+ # and append them back in the required <tool_response> format
1477
+ tool_resp_block = "".join(
1478
+ f"<tool_response>{call}\n{out}\n</tool_response>\n"
1479
+ for call, out in zip(tool_calls, results)
1480
+ )
1481
+ messages.append({"role": "user", "content": tool_resp_block})
1482
+
1483
+ # ── Step 4: flatten to a single trajectory ────────────────────────────
1484
+ trajectory = "\n".join(
1485
+ f"<{m['role']}>\n{m['content']}"
1486
+ for m in messages
1487
+ if m["role"] != "system"
1488
+ )
1489
+ return trajectory
1490
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ accelerate
3
+ openai
4
+ tiktoken
5
+ pix2tex
6
+ Pillow
7
+ gradio
8
+ fastapi
9
+ pydantic
10
+ uvicorn
11
+ together
12
+ beautifulsoup4
13
+ trafilatura
14
+ wikipedia
15
+ PyMuPDF
16
+ Crawl4AI
run_question.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # single_question_recall.py
2
+ from __future__ import annotations
3
+ import re
4
+ import os
5
+ from typing import Any, Dict, Optional
6
+
7
+ from re_call import ReCall
8
+ from transformers import AutoTokenizer
9
+
10
+
11
+ import re
12
+ from typing import Optional, Any, Dict, Tuple, List
13
+
14
+ def _extract_answer_boxed(s: str) -> Optional[str]:
15
+ """
16
+ Return the content of the *last* \\boxed{...} or \\fbox{...} in `s`,
17
+ with proper matching of nested braces. Escaped braces (\\{, \\}) are ignored
18
+ for counting. If no balanced block is found, returns None.
19
+ """
20
+ def _iter_box_like_spans(text: str):
21
+ # Find openings for \boxed{ and \fbox{
22
+ openings: List[Tuple[str, int, int]] = []
23
+ for m in re.finditer(r'\\boxed\s*\{', text):
24
+ openings.append(("boxed", m.start(), m.end()))
25
+ for m in re.finditer(r'\\fbox\s*\{', text):
26
+ openings.append(("fbox", m.start(), m.end()))
27
+ openings.sort(key=lambda x: x[1])
28
+ # For each opening, scan forward to find its matching closing brace
29
+ for kind, start, open_end in openings:
30
+ depth = 1
31
+ i = open_end
32
+ n = len(text)
33
+ while i < n:
34
+ ch = text[i]
35
+ # Skip escaped characters: backslash escapes the next char (including { or })
36
+ if ch == '\\' and i + 1 < n:
37
+ i += 2
38
+ continue
39
+ if ch == '{':
40
+ depth += 1
41
+ elif ch == '}':
42
+ depth -= 1
43
+ if depth == 0:
44
+ # content is text[open_end:i]
45
+ yield (kind, start, open_end, i)
46
+ break
47
+ i += 1
48
+
49
+ last_content: Optional[str] = None
50
+ for _, _start, open_end, close_idx in _iter_box_like_spans(s):
51
+ last_content = s[open_end:close_idx] # keep the *last* one
52
+
53
+ return last_content.strip() if last_content is not None else None
54
+
55
+
56
+ def _extract_answer_tagged(s: str) -> Optional[str]:
57
+ answer_tag_re = re.compile(r"<answer>(.*?)</answer>", re.S)
58
+ m = answer_tag_re.findall(s)
59
+ return m[-1].strip() if m else None
60
+
61
+
62
+ def _parse_answer_from_transcript(transcript: str) -> str:
63
+ """
64
+ Prefer balanced \\boxed{...}/\\fbox{...} content, then <answer>...</answer>,
65
+ else fall back to the last 200 chars.
66
+ """
67
+ return (
68
+ _extract_answer_boxed(transcript)
69
+ or _extract_answer_tagged(transcript)
70
+ # or transcript[-200:].strip()
71
+ )
72
+
73
+
74
+ # --- main API: recall only ---
75
+ def answer_question_recall(
76
+ question: str,
77
+ *,
78
+ model_url: Optional[str] = None, # your thinker endpoint (if recall uses one)
79
+ executor_url: Optional[str] = None,
80
+ tokenizer_dir: str = "./tokenizer-info",
81
+ temperature: float = 0.6,
82
+ max_new_tokens: int = 40960,
83
+ top_p: float = 0.95,
84
+ search_env: str = "from search_api import search_urls, open_url, search_and_parse_query, query_url",
85
+ func_schemas = [
86
+ {
87
+ "name": "search_urls",
88
+ "description": "Google search and return links to web-pages with a brief snippet given a text query",
89
+ "parameters": {
90
+ "type": "object",
91
+ "properties": {"query": {"type": "string"}, "top_k": {"type": "integer", "default": 10}},
92
+ "required": ["query"],
93
+ },
94
+ },
95
+ {
96
+ "name": "query_url",
97
+ "description": "Visit webpage and return evidence based retrival for the provided goal",
98
+ "parameters": {
99
+ "type": "object",
100
+ "properties": {
101
+ "url": {"type": "string", "description": "The URL of the webpage to visit. Must be a single URL"},
102
+ "goal": {"type": "string", "description": "The specific information goal for visiting webpage"},
103
+ },
104
+ "required": ["url", "goal"],
105
+ },
106
+ },
107
+ ],
108
+ deepseek_name: str = "deepseek-ai/DeepSeek-R1",
109
+ old_prompt: Optional[str] = None,
110
+ deepresearch_on: bool = True,
111
+ summary_llm: str = "gpt-4.1-mini",
112
+ ):
113
+ # ) -> Dict[str, Any]:
114
+ """
115
+ Runs a single question through ReCall and returns:
116
+ {
117
+ "answer": str,
118
+ "transcript": str,
119
+ "tool_calls": Any,
120
+ "chat": Any | None
121
+ }
122
+ """
123
+ if executor_url is None:
124
+ executor_url = os.environ["HOST_SERPER_URL"]
125
+
126
+ if model_url is None:
127
+ model_url = os.environ["HF_MODEL_URL"]
128
+
129
+ # 1) tokenizer (REQUIRED by ReCall.run)
130
+ tok = AutoTokenizer.from_pretrained(tokenizer_dir, trust_remote_code=True)
131
+
132
+ # 2) build agent
133
+ agent = ReCall(executor_url=executor_url)
134
+
135
+ last_out = ""
136
+
137
+ # 3) call the correct entrypoint
138
+ if model_url == deepseek_name:
139
+ # some setups use a special deepseek path that returns (transcript, tool_calls)
140
+ out = agent.run_deepseek(
141
+ env=search_env,
142
+ func_schemas=func_schemas,
143
+ question=question,
144
+ model_name=deepseek_name,
145
+ temperature=temperature,
146
+ max_tokens=max_new_tokens,
147
+ top_p=top_p,
148
+ )
149
+ transcript, tool_calls, chat = _normalize_out(out, expect_chat=False)
150
+ last_out = transcript
151
+ else:
152
+ # standard ReCall.run MUST receive tokenizer
153
+ agent_generator = agent.run(
154
+ env=search_env,
155
+ func_schemas=func_schemas,
156
+ question=question,
157
+ model_url=model_url,
158
+ temperature=temperature,
159
+ max_new_tokens=max_new_tokens,
160
+ tokenizer=tok, # <- fixes your "missing tokenizer" error
161
+ top_p=top_p,
162
+ old_prompt=old_prompt,# <- you can pass the raw old prompt here if there exists an older chat history
163
+ # the function will append the question to the raw old prompt string (chat history) if it is not None
164
+ deepresearch_on=deepresearch_on,
165
+ summary_llm=summary_llm
166
+ # deepresearch=deepresearch, # <- use the deepresearch prompt
167
+ )
168
+
169
+ while True:
170
+ try:
171
+ tag, out = next(agent_generator)
172
+ if tag == "assistant_resp":
173
+ last_out = out[0]
174
+ yield tag, out
175
+ if tag == "end":
176
+ break
177
+ except StopIteration as e:
178
+ # the chat_str variable contains the whole conversation in the raw string form
179
+ # it contains the raw tokens like "<|im_start|>system\n", "<|im_end|>"
180
+ # and "<|im_start|>assistant\n<think>", "<tool_response>", "\n</tool_response>\n", etc.
181
+ chat_str: str = e.value[1][0]
182
+ yield "end", (chat_str,)
183
+ break
184
+
185
+ # 4) parse final answer
186
+ answer = _parse_answer_from_transcript(last_out)
187
+
188
+ return "answer", (answer,)
189
+
190
+
191
+ def _normalize_out(out: Any, expect_chat: bool) -> tuple[str, Any, Any]:
192
+ """
193
+ Normalize ReCall outputs to (transcript, tool_calls, chat)
194
+ Handles:
195
+ - (transcript, tool_calls, chat)
196
+ - (transcript, tool_calls)
197
+ - "transcript"
198
+ - {"transcript": ..., "tool_calls": ..., "chat": ...} variants
199
+ """
200
+ transcript, tool_calls, chat = "", None, None
201
+
202
+ if isinstance(out, tuple):
203
+ if len(out) == 3:
204
+ transcript, tool_calls, chat = out
205
+ elif len(out) == 2:
206
+ transcript, tool_calls = out
207
+ elif len(out) == 1:
208
+ transcript = out[0]
209
+ else:
210
+ transcript = str(out[-1])
211
+ elif isinstance(out, dict):
212
+ transcript = out.get("transcript") or out.get("output") or out.get("response") or ""
213
+ tool_calls = out.get("tool_calls")
214
+ chat = out.get("chat")
215
+ else:
216
+ transcript = str(out)
217
+
218
+ # Some implementations return None/empty; keep things predictable
219
+ if chat is None and expect_chat is False:
220
+ chat = None
221
+ return transcript, tool_calls, chat
222
+
223
+
224
+ # quick demo
225
+ if __name__ == "__main__":
226
+ old_prompt = None
227
+
228
+ answer_generator = answer_question_recall(
229
+ "What is the most popular restraunt in kolkata?",
230
+ old_prompt=old_prompt
231
+ )
232
+
233
+ # print("ANSWER:", res["answer"])
234
+ # print("\n")
235
+ # # print(type(res["tool_calls"]), len(res["tool_calls"]))
236
+ # for i in res["tool_calls"]:
237
+ # print(f"{i}\n")
238
+ # print("\n")
239
+ # if res["chat"] is not None:
240
+ # # print(type(res["chat"]), len(res["chat"]))
241
+ # for i in res["chat"]:
242
+ # print(f"{i}\n")
243
+ # print("\n")
244
+ # print("TRANSCRIPT (tail):\n", res["transcript"][-300:])
245
+
246
+ final_chat_str = ""
247
+
248
+ while True:
249
+ try:
250
+ tag, out = next(answer_generator)
251
+ if tag == "assistant_resp":
252
+ assistant_text, tool_calls = out
253
+ print(f"ASSISTANT RESPONSE:\n{assistant_text}\n\n")
254
+ print("TOOL CALLS:\n")
255
+ for tool_call in tool_calls:
256
+ print(f"{tool_call}")
257
+ print("\n")
258
+ elif tag == "tool_results":
259
+ results = out[0]
260
+ print("TOOL RESULTS:\n")
261
+ for result in results:
262
+ print(f"{result}")
263
+ print("\n")
264
+ elif tag == "end":
265
+ print(f"{'='*20}\nASSISTANT RESPONSE ENDED\n{'='*20}\n\n")
266
+ final_chat_str = out[0]
267
+ elif tag == "answer":
268
+ answer = out[0]
269
+ print(f"FINAL ANSWER:\n{answer}\n\n")
270
+ break
271
+ except StopIteration as e:
272
+ print(f"FINAL ANSWER:\n{e.value[1][0]}\n\n")
273
+ break
274
+
275
+ print(f"{'='*20}\nEND\n{'='*20}\n\n\nFINAL CHAT STRING:\n{final_chat_str}\n\n")
tokenizer-info/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0284b582e14987fbd3d5a2cb2bd139084371ed9acbae488829a1c900833c680
3
+ size 707
tokenizer-info/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer-info/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76862e765266b85aa9459767e33cbaf13970f327a0e88d1c65846c2ddd3a1ecd
3
+ size 613
tokenizer-info/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer-info/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:443bfa629eb16387a12edbf92a76f6a6f10b2af3b53d87ba1550adfcf45f7fa0
3
+ size 5404
tokenizer-info/vocab.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
3
+ size 2776833
web_agents_5/compressor.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # compressor.py
2
+ from __future__ import annotations
3
+ import functools, json, logging, re
4
+ from difflib import SequenceMatcher
5
+ from io import StringIO
6
+ from typing import Dict, List, Tuple
7
+
8
+ import pandas as pd
9
+ import regex # needed by tiktoken
10
+ import tiktoken
11
+ from bs4 import BeautifulSoup
12
+ from config import CFG
13
+ from web_helpers import retry
14
+
15
+ # ────────────────────────────────────────────────────────────────────────
16
+ # 0. shared helpers
17
+ # ------------------------------------------------------------------------
18
+ enc = tiktoken.get_encoding("cl100k_base")
19
+ _tok = lambda s: len(enc.encode(s)) # fast inline counter
20
+
21
+ @functools.lru_cache(maxsize=1)
22
+ def _nlp():
23
+ import spacy
24
+ return spacy.load("en_core_web_sm")
25
+
26
+ def _openai_client():
27
+ """Import OpenAI lazily to avoid overhead when not needed."""
28
+ import importlib
29
+ mod = importlib.import_module("openai")
30
+ return getattr(mod, "OpenAI", None)() if hasattr(mod, "OpenAI") else mod
31
+
32
+ # ────────────────────────────────────────────────────────────────────────
33
+ # 1. regex patterns (compiled once)
34
+ # ------------------------------------------------------------------------
35
+ DATE_PATS = [re.compile(p, re.I) for p in [
36
+ r"\d{4}-\d{2}-\d{2}",
37
+ r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}",
38
+ r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}",
39
+ r"\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}",
40
+ r"\b\d{4}/\d{2}\b",
41
+ r"\b\d{4}\b(?!\s*(?:%|million|billion|thousand))",
42
+ ]]
43
+ EMAIL_PAT = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
44
+ URL_PAT = re.compile(r"https?://[^\s\)]+")
45
+ PHONE_PAT = re.compile(r"\+?\d[\d\s\-().]{7,}\d")
46
+ CURR_PAT = re.compile(r"(\$\s?\d+(?:,\d{3})*(?:\.\d+)?|\d+(?:,\d{3})*(?:\.\d+)?\s*(USD|EUR|GBP|INR|Β₯|β‚©|β‚Ή|€))", re.I)
47
+ DEF_PAT = re.compile(r"([A-Z][A-Za-z0-9\s]+?)\s+(is|are|refers to|means)\s+(.*?)(?:[\.\n])")
48
+
49
+ MD_TABLE_PAT = re.compile(
50
+ r"(?:^\|.*?\|\n?)+(?:^\|[-:\s|]+\|\n?)?(?:^\|.*?\|\n?)+", re.M)
51
+ CSV_PAT = re.compile(r"((?:^.*?,.*?\n){2,})", re.M)
52
+ TSV_PAT = re.compile(r"((?:^.*?\t.*?\n){2,})", re.M)
53
+
54
+ # ────────────────────────────────────────────────────────────────────────
55
+ # 2. core utilities
56
+ # ------------------------------------------------------------------------
57
+ def deduplicate_items(items: List[str], *, similarity=0.5,
58
+ other: List[str] | None = None) -> List[str]:
59
+ """Drop near‑duplicates; prefer the longest variant."""
60
+ if not items: return []
61
+ other = other or []
62
+
63
+ def _clean(x: str) -> str:
64
+ x = re.sub(r'\[edit\]|\[\d+\]', '', x)
65
+ return re.sub(r'\s+', ' ', x).strip()
66
+
67
+ out, out_clean = [], []
68
+ for orig in items:
69
+ clean = _clean(orig)
70
+ dup = False
71
+ for ref in out_clean + list(map(_clean, other)):
72
+ sim = SequenceMatcher(None, clean, ref).ratio()
73
+ if sim >= similarity or clean in ref or ref in clean:
74
+ dup = True
75
+ # if current is longer than stored, replace
76
+ if clean not in out_clean and len(clean) > len(ref):
77
+ idx = out_clean.index(ref)
78
+ out[idx], out_clean[idx] = orig, clean
79
+ break
80
+ if not dup:
81
+ out.append(orig)
82
+ out_clean.append(clean)
83
+ return out
84
+
85
+ # ────────────────────────────────────────────────────────────────────────
86
+ # 3. fact & table extractor
87
+ # ------------------------------------------------------------------------
88
+ def extract_facts_and_tables(text: str) -> Tuple[str, List[str], List[str]]:
89
+ facts, spans = [], []
90
+
91
+ def _add(match):
92
+ facts.append(match.group())
93
+ spans.append(match.span())
94
+
95
+ for pat in DATE_PATS:
+     for m in pat.finditer(text): _add(m)
96
+ for m in EMAIL_PAT.finditer(text): _add(m)
97
+ for m in URL_PAT.finditer(text): _add(m)
98
+ for m in PHONE_PAT.finditer(text): _add(m)
99
+ for m in CURR_PAT.finditer(text): _add(m)
100
+ for m in DEF_PAT.finditer(text): _add(m)
101
+
102
+ # contextual sentences around facts
103
+ doc = _nlp()(text)
104
+ ctx = [s.text.strip() for s in doc.sents
105
+ if any(s.start_char <= s_ <= s.end_char for s_, _ in spans)]
106
+ facts.extend(ctx)
107
+ facts = sorted(set(facts))
108
+
109
+ # ── tables
110
+ tables = []
111
+
112
+ for tbl in MD_TABLE_PAT.findall(text):
113
+ cleaned = "\n".join(l for l in tbl.splitlines()
114
+ if l.strip() and not re.match(r"^\|[-:\s|]+\|$", l))
115
+ if len(cleaned.splitlines()) < 2: continue
116
+ try:
117
+ df = pd.read_csv(StringIO(cleaned), sep="|").dropna(how="all", axis=1)
118
+ tables.append(df.to_markdown(index=False))
119
+ except Exception:
120
+ tables.append(cleaned)
121
+
122
+ soup = BeautifulSoup(text, "lxml")
123
+ for html_tbl in soup.find_all("table"):
124
+ try:
125
+ df = pd.read_html(str(html_tbl))[0]
126
+ tables.append(df.to_markdown(index=False))
127
+ except Exception:
128
+ tables.append(str(html_tbl))
129
+
130
+ for m in CSV_PAT.finditer(text):
131
+ try:
132
+ df = pd.read_csv(StringIO(m.group(1)))
133
+ if not df.empty:
134
+ tables.append(df.to_markdown(index=False))
135
+ except Exception:
136
+ pass
137
+ for m in TSV_PAT.finditer(text):
138
+ try:
139
+ df = pd.read_csv(StringIO(m.group(1)), sep="\t")
140
+ if not df.empty:
141
+ tables.append(df.to_markdown(index=False))
142
+ except Exception:
143
+ pass
144
+
145
+ # ── clean narrative (remove facts & tables)
146
+ narrative = text
147
+ for tbl in tables: narrative = narrative.replace(tbl, " ")
148
+ for s, e in sorted(spans, reverse=True): narrative = narrative[:s] + narrative[e:]
149
+ narrative = re.sub(r"\s{2,}", " ", narrative).strip()
150
+
151
+ return narrative, facts, tables
152
+
153
+ # ────────────────────────────────────────────────────────────────────────
154
+ # 4. OpenAI summariser helpers
155
+ # ------------------------------------------------------------------------
156
+ def _summarise(text: str, pct: float, model: str) -> str:
157
+ target_tokens = int(_tok(text) * pct)
158
+ sys_prompt = (
159
+ "You are an expert abstractor. Summarize the text below to "
160
+ f"approximately {pct*100:.0f}% of its original length (β‰ˆ{target_tokens} tokens), "
161
+ "while **retaining all key facts, figures, names, dates, places, and events**. "
162
+ "Ensure the summary remains accurate, informative, and faithful to the original content."
163
+ )
164
+ client = _openai_client()
165
+ rsp = client.chat.completions.create(
166
+ model=model, temperature=0.2,
167
+ messages=[{"role":"system","content":sys_prompt},
168
+ {"role":"user","content":text}],
169
+ max_tokens=CFG.output_limit_per_link
170
+ )
171
+ return rsp.choices[0].message.content
172
+
173
+ # ────────────────────────────────────────────────────────────────────────
174
+ # 5. compress_text (public)
175
+ # ------------------------------------------------------------------------
176
+ def compress_text(text: str, *, pct: float = 0.3,
177
+ model: str = "gpt-4o-mini") -> str:
178
+
179
+ FACTS_TABLES_LIMIT = CFG.output_limit_per_link - CFG.disable_narrative_compress_thresh
180
+ narrative, facts, tables = extract_facts_and_tables(text)
181
+
182
+ # narrative compression
183
+ if _tok(narrative) > CFG.disable_narrative_compress_thresh:
184
+ narrative_txt = _summarise(narrative, pct, model)
185
+ else:
186
+ narrative_txt = narrative
187
+ return narrative_txt
188
+
189
+ # ────────────────────────────────────────────────────────────────────────
190
+ # 6. query_text (goal‑oriented extraction)
191
+ # ------------------------------------------------------------------------
192
+ EXTRACTOR_SYS_PROMPT = (
193
+ "You are a highly skilled information extraction agent. Your job is to analyze long, complex webpages "
194
+ "in the context of a specific user goal. You excel at identifying relevant sections, capturing supporting evidence "
195
+ "in full original context, and providing logically structured summaries. Always ensure precision, completeness, "
196
+ "and alignment with the user’s intent."
197
+ )
198
+ EXTRACTOR_PROMPT_TEMPLATE = """You are a highly skilled information extraction agent. Your task is to analyze the following webpage content in light of a specific user goal, and extract accurate, well-structured information using plain text format.
199
+
200
+ ## Webpage Content
201
+ {webpage_content}
202
+
203
+ ## User Goal
204
+ {goal}
205
+
206
+ ## Task Guidelines
207
+ 1. **Rationale**: Briefly explain why this content is relevant to the user’s goal.
208
+ 2. **Evidence**: Quote the most relevant parts of the webpage that directly support or address the goal. Use bullet points or numbered lines separated by newlines.
209
+ 3. **Summary**: Provide a clear, logically structured summary of the extracted evidence that addresses the user's goal.
210
+
211
+ ## Output Format
212
+ Your response must follow **exactly this format** with the three sections:
213
+ Rationale: <one paragraph>
214
+ Evidence: <first point>\n<second point>...
215
+ Summary: <concise paragraph summarizing the evidence>
216
+ """
217
+
218
+ def extract_regex(text: str) -> Dict[str, str]:
219
+ def extract_section(header: str) -> str:
220
+ # Match the section starting with `Header:` until the next capitalized line followed by `:` or end
221
+ pattern = rf"{header}:\s*(.*?)(?=\n[A-Z][a-z]+:|\Z)"
222
+ match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
223
+ return match.group(1).strip() if match else "(not found)"
224
+
225
+ return {
226
+ "rationale": extract_section("Rationale"),
227
+ "evidence": extract_section("Evidence"),
228
+ "summary": extract_section("Summary")
229
+ }
230
+
231
+ def query_text(
232
+ url: str,
233
+ text: str,
234
+ goal: str,
235
+ *,
236
+ model: str = "gpt-4.1-mini",
237
+ max_attempts: int = 3,
238
+ ) -> Dict[str, str]:
239
+ """Goal‑oriented extractor with retries β†’ compress fallback β†’ token trim fallback."""
240
+ prompt = EXTRACTOR_PROMPT_TEMPLATE.format(
241
+ webpage_content=text[:15_000], # clip for safety
242
+ goal=goal,
243
+ )
244
+ client = _openai_client()
245
+
246
+ for attempt in range(1, max_attempts + 1):
247
+ try:
248
+ rsp = client.chat.completions.create(
249
+ model=model,
250
+ temperature=0.0,
251
+ messages=[
252
+ {"role": "system", "content": EXTRACTOR_SYS_PROMPT},
253
+ {"role": "user", "content": prompt},
254
+ ],
255
+ max_tokens = 1024
256
+ ).choices[0].message.content
257
+
258
+ extracted = extract_regex(rsp)
259
+
260
+ # Sanity check: evidence + summary must be > 20 characters
261
+ if len(extracted.get("evidence", "")) + len(extracted.get("summary", "")) > 20:
262
+ return {
263
+ "extracted_info": (
264
+ f"The useful information in {url} for goal β€œ{goal}”:\n\n"
265
+ f"Rationale:\n{extracted.get('rationale')}\n\n"
266
+ f"Evidence:\n{extracted.get('evidence')}\n\n"
267
+ f"Summary:\n{extracted.get('summary')}"
268
+ )
269
+ }
270
+
271
+ raise ValueError("LLM returned empty or malformed extraction")
272
+
273
+ except Exception as e:
274
+ logging.warning("Attempt %d/%d failed for query-based extraction: %s",
275
+ attempt, max_attempts, e)
276
+
277
+ # ── Retry fallback: compress text ─────────────────────────────────────
278
+ try:
279
+ compressed = compress_text(text, model=model)
280
+ return {
281
+ "extracted_info": (
282
+ f"Goal-based extraction failed after {max_attempts} attempts; "
283
+ f"returning compressed webpage:\n\n{compressed}"
284
+ )
285
+ }
286
+ except Exception as ce:
287
+ logging.error("compress_text also failed: %s", ce)
288
+
289
+ # ── Final fallback: hard truncate to token budget ────────────────────
290
+ return {
291
+ "extracted_info": (
292
+ "Goal-based extraction and compression both failed; "
293
+ "returning truncated webpage content:\n\n" +
294
+ enc.decode(enc.encode(text)[:CFG.output_limit_per_link])
295
+ )
296
+ }
297
+
298
+
299
+ # ────────────────────────────────────────────────────────────────────────
300
+ # 7. helper: trim long lists to token budget
301
+ # ------------------------------------------------------------------------
302
+ def trim_to_budget(items: List[str], budget: int, *,
303
+ is_table: bool) -> Tuple[str, int]:
304
+ build, used = [], 0
305
+ for it in items:
306
+ toks = _tok(it)
307
+ if used + toks > budget:
308
+ break
309
+ build.append(it)
310
+ used += toks
311
+ if len(build) < len(items):
312
+ build.append(f"[{len(items)-len(build)} {'tables' if is_table else 'facts'} omitted]")
313
+ joined = "\n\n".join(build) if is_table else "\n".join(build)
314
+ return joined, _tok(joined)
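A minimal usage sketch for the two public entry points above, assuming OPENAI_API_KEY is set; the sample text and URL are illustrative.

page = "Revenue was $1,200,000 in 2023. Contact: press@example.com."
summary = compress_text(page, pct=0.25)        # plain-string narrative summary
result = query_text("https://example.com", page,
                    goal="What was the 2023 revenue?")
print(summary)
print(result["extracted_info"])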
web_agents_5/config.py ADDED
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+ import logging, os, random, requests
3
+
4
+ class _Cfg:
5
+ ua: str = (
6
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
7
+ "(KHTML, like Gecko) Chrome/125.0 Safari/537.36"
8
+ )
9
+ serper_key = os.getenv("SERPER_API_KEY", "")
10
+ jina_cache_dir = os.getenv("JINA_CACHE_DIR", "")
11
+ serper_cache_dir = os.getenv("SERPER_CACHE_DIR", "")
12
+ jina_key = os.getenv("JINA_API_KEY", "")
13
+ serper_ep = "https://google.serper.dev/search"
14
+ crawl4ai_ep = os.getenv("CRAWL4AI_EP", "http://localhost:8080")
15
+ retries = 3
16
+ backoff = 0.8
17
+ connect_to = 5
18
+ read_to = 10
19
+ stream_html_cap = 200_000
20
+ pdf_size_cap = 32_000_000
21
+ pdf_pages_cap = 40
22
+ pdf_chars_cap = 40_000
23
+ text_cap = 400_000
24
+ output_limit_per_link = 6_000
25
+ disable_narrative_compress_thresh = 2_000
26
+ pct = 0.25 # narrative compression pct
27
+ reddit_client_id = "Q2tovcGfYmo3hPNvzTpkXA"
28
+ reddit_client_secret = "geu4gH3pEOrNnsMpQvdTTVhQvDABgg"
29
+
30
+
31
+ CFG = _Cfg()
32
+ _RND = random.Random()
33
+ _SESS = requests.Session()
34
+ _SESS.headers.update({"User-Agent": CFG.ua})
35
+
36
+ # logging.basicConfig(level=logging.INFO,
37
+ # format="%(asctime)s - %(levelname)s - %(message)s")
38
+ logging.getLogger().setLevel(logging.INFO) # ensure the root logger emits INFO
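A minimal sketch of wiring the configuration from the environment before any helper module is imported; the key values are placeholders, and it assumes web_agents_5 is on PYTHONPATH as in host_serper2.sh.

import os
os.environ.setdefault("SERPER_API_KEY", "<serper-key>")
os.environ.setdefault("JINA_API_KEY", "<jina-key>")
os.environ.setdefault("JINA_CACHE_DIR", "./.cache/jina_cache")
os.environ.setdefault("SERPER_CACHE_DIR", "./.cache/serper_cache")

from config import CFG   # reads the variables above at import time
assert CFG.serper_key, "SERPER_API_KEY is required for google_search()"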
web_agents_5/fetchers/__init__.py ADDED
File without changes
web_agents_5/fetchers/basic_fetcher.py ADDED
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+ import logging
3
+ from urllib.parse import unquote
4
+ from config import CFG, _SESS
5
+ from web_helpers import extract_main_text, fetch_blocked_site
6
+
7
+ _BINARY = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".zip", ".tar",
8
+ ".gz", ".mp3", ".mp4", ".mkv", ".exe")
9
+
10
+ _ERROR = ["wrong", "error", "try again"]
11
+
12
+ def _looks_like_error(txt):
13
+ if len(txt) < 300:
14
+ for err in _ERROR:
15
+ if err in txt.lower():
16
+ return True
17
+ return False
18
+
19
+
20
+ def fetch_html(url: str) -> str:
21
+ if url.lower().endswith(_BINARY):
22
+ return "[binary omitted]"
23
+ try:
24
+ r = _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to))
25
+ r.raise_for_status()
26
+ ctype = (r.headers.get("content-type") or "").lower()
27
+ if "pdf" in ctype or not ("text" in ctype or "html" in ctype):
28
+ return "[binary omitted]"
29
+ raw = r.raw.read(CFG.stream_html_cap, decode_content=True)
30
+ html = raw.decode(r.encoding or "utf-8", errors="ignore")
31
+ txt = extract_main_text(html).strip()
32
+ if "wikipedia.org" in url:
33
+ slug = unquote(url.rsplit("/", 1)[-1]).replace("_", " ")
34
+ if slug.lower() not in txt.lower():
35
+ txt = f"{slug}\n\n{txt}"
36
+ if _looks_like_error(txt):
37
+ return f"[Error fetching url: {url}]"
38
+ else:
39
+ return "[Retrieved using HTML] " + txt
40
+ except Exception as e:
41
+ logging.error("Generic fetch failed %s: %s", url, e)
42
+ return fetch_blocked_site(url)
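A minimal call sketch; binary extensions and non-text content types short-circuit to "[binary omitted]", and scrape failures fall back to fetch_blocked_site.

text = fetch_html("https://en.wikipedia.org/wiki/Web_scraping")
print(text[:300])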
web_agents_5/fetchers/crawl4ai_fetcher.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ Asynchronous wrapper around **Crawl4AI** so that other coroutines can await a
3
+ single call – identical to the previous implementation but isolated in its own
4
+ module to satisfy clean‑architecture / layering.
5
+
6
+ Public API
7
+ ==========
8
+ async def fetch_crawl4ai(url: str) -> str
9
+ Returns markdown extracted by Crawl4AI or raises `RuntimeError` on failure.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import asyncio, logging
14
+ from dataclasses import dataclass, field
15
+ from typing import Any
16
+
17
+ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
18
+ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
19
+ from config import CFG
20
+
21
+ # ----------------------------------------------------------------------------
22
+ _MAX_CONCURRENT_PAGES = 6
23
+ _MAX_ATTEMPTS = 5
24
+ _RETRYABLE = (
25
+ "handler is closed",
26
+ "browser has disconnected",
27
+ "transport closed",
28
+ "target crashed",
29
+ )
30
+
31
+ # Globals bound to the *event‑loop* currently active
32
+ _CRAWLER: AsyncWebCrawler | None = None
33
+ _CRAWLER_LOOP: asyncio.AbstractEventLoop | None = None
34
+ _SEMAPHORES: dict[asyncio.AbstractEventLoop, asyncio.Semaphore] = {}
35
+ _CFG = CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator())
36
+
37
+
38
+ def _get_semaphore() -> asyncio.Semaphore:
39
+ loop = asyncio.get_running_loop()
40
+ if loop not in _SEMAPHORES:
41
+ _SEMAPHORES[loop] = asyncio.Semaphore(_MAX_CONCURRENT_PAGES)
42
+ return _SEMAPHORES[loop]
43
+
44
+
45
+ async def _ensure_crawler() -> None:
46
+ global _CRAWLER, _CRAWLER_LOOP
47
+ loop = asyncio.get_running_loop()
48
+ if _CRAWLER is None or loop is not _CRAWLER_LOOP:
49
+ if _CRAWLER is not None:
50
+ try:
51
+ await _CRAWLER.aclose()
52
+ except Exception:
53
+ pass
54
+ _CRAWLER = AsyncWebCrawler()
55
+ await _CRAWLER.__aenter__()
56
+ _CRAWLER_LOOP = loop
57
+
58
+
59
+ @dataclass
60
+ class _Page:
61
+ success: bool
62
+ markdown: str | None = None
63
+ error: str | None = None
64
+ meta: dict[str, Any] = field(default_factory=dict)
65
+
66
+
67
+ async def _crawl_once(url: str) -> _Page:
68
+ global _CRAWLER
69
+ await _ensure_crawler()
70
+
71
+ try:
72
+ result = await _CRAWLER.arun(url, config=_CFG)
73
+ if result.success and result.markdown:
74
+ return _Page(True, result.markdown, meta=result.__dict__)
75
+ return _Page(False, error=result.error_message or "no markdown")
76
+ except Exception as exc:
77
+ return _Page(False, error=str(exc))
78
+
79
+
80
+ async def fetch_crawl4ai(url: str) -> str:
81
+ """Return markdown extracted by Crawl4AI or raise on failure."""
82
+ sem = _get_semaphore()
83
+ async with sem:
84
+ for attempt in range(1, _MAX_ATTEMPTS + 1):
85
+ page = await _crawl_once(url)
86
+ if page.success and page.markdown:
87
+ print(len(page.markdown))
88
+ return "[Retrieved from Craw4AI]" + page.markdown[:CFG.text_cap]
89
+
90
+ err = page.error or "unknown"
91
+ logging.warning("Crawl4AI attempt %d/%d failed: %s", attempt, _MAX_ATTEMPTS, err)
92
+
93
+ if attempt < _MAX_ATTEMPTS and any(p in err.lower() for p in _RETRYABLE):
94
+ # reset shared browser & retry after back‑off
95
+ global _CRAWLER
96
+ try:
97
+ await _CRAWLER.aclose()
98
+ except Exception:
99
+ pass
100
+ _CRAWLER = None
101
+ await asyncio.sleep(1.5 * attempt)
102
+ continue
103
+
104
+ raise RuntimeError(err)
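A minimal sketch of driving the async crawler from synchronous code; it assumes a working Crawl4AI browser installation.

import asyncio
markdown = asyncio.run(fetch_crawl4ai("https://example.com"))
print(markdown[:300])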
web_agents_5/fetchers/github_fetcher.py ADDED
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+ from config import CFG, _SESS, _RND
3
+ import logging, os
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ import functools
7
+ import random
8
+ import requests
9
+ import trafilatura
10
+ import time
11
+ from web_helpers import retry, fetch_blocked_site # ⬅️ shared
12
+
13
+
14
+ def fetch_github(url: str) -> str:
15
+ def _markdown_cleanup(md: str) -> str:
16
+ md = re.sub(r"```.*?```", "", md, flags=re.S)
17
+ md = re.sub(r"^#+\s*", "", md, flags=re.M)
18
+ return re.sub(r"[ \t]{2,}", " ", md).strip()
19
+
20
+ hdr = {"User-Agent": "ii-research-bot/0.6"}
21
+ try:
22
+ m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
23
+ if m:
24
+ owner, repo = m.groups()
25
+ api = f"https://api.github.com/repos/{owner}/{repo}/readme"
26
+ hdr_api = hdr | {"Accept": "application/vnd.github.v3.raw"}
27
+ if (tok := os.getenv("GITHUB_TOKEN")):
28
+ hdr_api["Authorization"] = f"Bearer {tok}"
29
+ r = _SESS.get(api, headers=hdr_api, timeout=(CFG.connect_to, CFG.read_to))
30
+ if r.ok and len(r.text) > 30:
31
+ return _markdown_cleanup(r.text)[:CFG.text_cap]
32
+
33
+ if "/blob/" in url or "/tree/" in url:
34
+ raw = re.sub(
35
+ r"https://github\.com/([^/]+)/([^/]+)/(?:blob|tree)/",
36
+ r"https://raw.githubusercontent.com/\1/\2/",
37
+ url,
38
+ count=1,
39
+ ).split("?", 1)[0]
40
+ r = _SESS.get(raw, headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
41
+ if r.ok and "text" in (r.headers.get("content-type") or "") and len(r.text) > 0:
42
+ return r.text[:CFG.text_cap]
43
+
44
+ raw1 = url + ("?raw=1" if "?" not in url else "&raw=1")
45
+ r = _SESS.get(raw1, headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
46
+ if r.ok and "text" in (r.headers.get("content-type") or "") and len(r.text) > 0:
47
+ return r.text[:CFG.text_cap]
48
+
49
+ plain = url + ("?plain=1" if "?" not in url else "&plain=1")
50
+ html = _SESS.get(plain, headers=hdr, timeout=(CFG.connect_to, CFG.read_to)).text
51
+ soup = BeautifulSoup(html, "lxml")
52
+ pre = soup.find("pre")
53
+ if pre and len(pre.text) > 10:
54
+ return pre.text[:CFG.text_cap]
55
+
56
+ if "raw.githubusercontent.com" in url:
57
+ r = _SESS.get(url.split("?", 1)[0], headers=hdr, timeout=(CFG.connect_to, CFG.read_to))
58
+ if r.ok and "text" in (r.headers.get("content-type") or ""):
59
+ return "[Retrieved from raw.githubusercontent.com]" + r.text[:CFG.text_cap]
60
+
61
+ raise RuntimeError("github: unable to retrieve plain text")
62
+ except Exception as e:
63
+ logging.error(f"GitHub fetch failed for {url}: {e}")
64
+ return fetch_blocked_site(url)
web_agents_5/fetchers/jina_fetcher.py ADDED
@@ -0,0 +1,151 @@
52
+ """
53
+ Jina AI powered web-page fetcher with URL-based disk cache.
54
+
55
+ - Cache key: canonicalized URL (sha256)
56
+ - Cache location: <CFG.cache_dir or $CACHE_DIR or ".cache">/jina_read/
57
+ - Always stores the *raw* Jina body (without the "[Retrieved...]" prefix).
58
+ - Atomic writes via os.replace for basic thread/process safety.
59
+ """
60
+
61
+ from __future__ import annotations
62
+
63
+ import hashlib
64
+ import logging
65
+ import os
66
+ import urllib.parse as _u
67
+ from typing import Tuple
68
+
69
+ from config import CFG, _SESS # shared requests session and config
70
+ from web_helpers import retry
71
+
72
+ _JINA_ENDPOINT = "https://r.jina.ai/{url}" # expects a fully-qualified, url-encoded target
73
+
74
+
75
+ def _canonicalize_url(url: str) -> str:
76
+ """Ensure URL has a scheme and is normalized for caching/API calls."""
77
+ p = _u.urlparse(url.strip())
78
+ if not p.scheme:
79
+ # Default to http if missing; Jina reader prefers explicit scheme.
80
+ p = _u.urlparse("http://" + url.strip())
81
+
82
+ # Normalize: lowercase scheme/netloc, drop fragment, keep query & path
83
+ p = p._replace(scheme=p.scheme.lower(), netloc=p.netloc.lower(), fragment="")
84
+ # Ensure path is at least "/"
85
+ path = p.path if p.path else "/"
86
+ return _u.urlunparse((p.scheme, p.netloc, path, "", p.query, ""))
87
+
88
+
89
+ def _cache_paths(nurl: str) -> Tuple[str, str]:
90
+ """Return (cache_dir, cache_file_path) for a normalized URL."""
91
+ cache_root = CFG.jina_cache_dir
92
+ cache_dir = os.path.join(cache_root, "jina_read")
93
+ os.makedirs(cache_dir, exist_ok=True)
94
+ h = hashlib.sha256(nurl.encode("utf-8")).hexdigest()
95
+ return cache_dir, os.path.join(cache_dir, f"{h}.txt")
96
+
97
+
98
+ def _load_from_cache(cpath: str) -> str | None:
99
+ try:
100
+ if os.path.exists(cpath) and os.path.getsize(cpath) > 0:
101
+ with open(cpath, "r", encoding="utf-8") as f:
102
+ return f.read()
103
+ except Exception as e:
104
+ logging.debug("Jina cache read failed (%s): %s", cpath, e)
105
+ return None
106
+
107
+
108
+ def _save_to_cache(cpath: str, body: str) -> None:
109
+ try:
110
+ tmp = f"{cpath}.tmp.{os.getpid()}"
111
+ with open(tmp, "w", encoding="utf-8") as f:
112
+ f.write(body)
113
+ os.replace(tmp, cpath) # atomic on the same filesystem
114
+ except Exception as e:
115
+ logging.debug("Jina cache write failed (%s): %s", cpath, e)
116
+
117
+
118
+ @retry
119
+ def fetch_jina(url: str) -> str:
120
+ """Return article text extracted by **Jina AI Read API** with disk cache.
121
+
122
+ Raises:
123
+ RuntimeError – if the endpoint does not yield usable text
124
+ """
125
+ nurl = _canonicalize_url(url)
126
+ cache_dir, cpath = _cache_paths(nurl)
127
+
128
+ # 1) Try cache
129
+ cached = _load_from_cache(cpath)
130
+ if cached:
131
+ logging.info("Jina fetch (cache hit) ← %s", nurl)
132
+ return "[Retrieved from Jina AI] " + cached[: CFG.text_cap]
133
+
134
+ # 2) Fetch from Jina
135
+ api_url = _JINA_ENDPOINT.format(url=_u.quote(nurl, safe=""))
136
+ headers = {"Authorization": f"Bearer {CFG.jina_key}"}
137
+ logging.debug("Jina fetch (cache miss) β†’ %s", api_url)
138
+
139
+ r = _SESS.get(api_url, headers=headers, timeout=(CFG.connect_to, CFG.read_to))
140
+ r.raise_for_status()
141
+ body = r.text.strip()
142
+
143
+ # 3) Validate
144
+ if len(body) < 200 and any(err in body.lower() for err in ("403", "forbidden", "error")):
145
+ raise RuntimeError("Jina AI returned no content")
146
+
147
+ # 4) Save to cache (store the raw body; callers always get the standard prefix)
148
+ _save_to_cache(cpath, body)
149
+
150
+ return "[Retrieved from Jina AI] " + body[: CFG.text_cap]
151
+
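A sketch of the cache behaviour, assuming JINA_API_KEY and JINA_CACHE_DIR are set: the first call hits the Read API and writes <JINA_CACHE_DIR>/jina_read/<sha256>.txt, the second is served from disk because both URLs canonicalize identically (scheme and host lowercased, fragment dropped).

a = fetch_jina("HTTPS://Example.com/page#section")
b = fetch_jina("https://example.com/page")
assert a == b   # same canonical URL, same cache entry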
web_agents_5/fetchers/pdf_fetcher.py ADDED
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+ from config import CFG, _SESS
3
+ import io, logging, re, pymupdf as fitz
4
+
5
+ from web_helpers import retry, fetch_blocked_site # ⬅️ shared
6
+ # ----------------------------------------------------------------------
7
+
8
+ class PDFExtractError(RuntimeError): ...
9
+
10
+ @retry
11
+ def _download_pdf(url: str) -> bytes:
12
+ with _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to)) as r:
13
+ r.raise_for_status()
14
+ total = int(r.headers.get("content-length", 0) or 0)
15
+ if total and total > CFG.pdf_size_cap:
16
+ raise RuntimeError("pdf too large")
17
+ buf = io.BytesIO()
18
+ for chunk in r.iter_content(16_384):
19
+ buf.write(chunk)
20
+ if buf.tell() > CFG.pdf_size_cap:
21
+ raise RuntimeError("pdf exceeds cap")
22
+ return buf.getvalue()
23
+
24
+ def _extract_pdf(buf: bytes) -> str:
25
+ try:
26
+ doc = fitz.open(stream=buf, filetype="pdf")
27
+ except Exception as e:
28
+ raise PDFExtractError(e)
29
+ parts, chars = [], 0
30
+ for page in doc:
31
+ if len(parts) >= CFG.pdf_pages_cap:
32
+ break
33
+ text = (
34
+ page.get_text("text")
35
+ .replace("\u00A0", " ")
36
+ .replace("-\n", "")
37
+ )
38
+ parts.append(text)
39
+ chars += len(text)
40
+ if chars > CFG.pdf_chars_cap:
41
+ break
42
+ out = " ".join(parts).strip()[:CFG.pdf_chars_cap]
43
+ if not out:
44
+ raise PDFExtractError("empty / scanned pdf")
45
+ return "[Retrieved from PyMUPDF]" + out
46
+
47
+ def fetch_pdf(url: str) -> str:
48
+ try:
49
+ buf = _download_pdf(url)
50
+ return _extract_pdf(buf)
51
+ except Exception as e:
52
+ logging.error("PDF fetch failed for %s: %s", url, e)
53
+ return fetch_blocked_site(url)[:CFG.text_cap]
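A minimal call sketch; the download is capped at CFG.pdf_size_cap bytes and extraction at CFG.pdf_pages_cap pages / CFG.pdf_chars_cap characters, so long papers come back truncated.

text = fetch_pdf("https://arxiv.org/pdf/1706.03762")
print(text[:400])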
web_agents_5/fetchers/reddit_fetcher.py ADDED
@@ -0,0 +1,324 @@
1
+
2
+
3
+ from __future__ import annotations
4
+ from config import CFG, _SESS, _RND
5
+ import logging
6
+ import re
7
+ from bs4 import BeautifulSoup
8
+ import functools
9
+ import random
10
+ import requests
11
+ import time
12
+ import trafilatura
13
+ from web_helpers import retry, fetch_blocked_site
14
+
15
+
16
+ _REDDIT_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
17
+
18
+
19
+ @retry
20
+ def _reddit_json_api(url: str) -> str | None:
21
+ api_url = re.sub(r"/comments/([a-z0-9]{6,8}).*", r"/comments/\1.json", url)
22
+ try:
23
+ headers = {"User-Agent": _REDDIT_UA, "Accept": "application/json"}
24
+ r = _SESS.get(
25
+ api_url,
26
+ params={"limit": 5, "depth": 2, "raw_json": 1},
27
+ headers=headers,
28
+ timeout=(CFG.connect_to, CFG.read_to),
29
+ )
30
+ if "blocked" in r.text.lower() or r.status_code != 200:
31
+ return None
32
+
33
+ data = r.json()
34
+ post_data = data[0]["data"]["children"][0]["data"]
35
+ title = post_data.get("title", "")
36
+ selftext = post_data.get("selftext", "")
37
+ author = post_data.get("author", "")
38
+
39
+ comments = []
40
+ if len(data) > 1:
41
+ for comment in data[1]["data"]["children"][:50]:
42
+ if comment["kind"] == "t1":
43
+ c_author = comment["data"].get("author", "")
44
+ c_body = comment["data"].get("body", "")
45
+ if c_body:
46
+ comments.append(f"u/{c_author}: {c_body}")
47
+
48
+ result = f"Title: {title}\nPosted by: u/{author}\n\n"
49
+ if selftext:
50
+ result += f"{selftext}\n\n"
51
+ if comments:
52
+ result += "Top comments:\n\n" + "\n\n".join(comments)
53
+
54
+ return result.strip()
55
+ except Exception:
56
+ return None
57
+ import urllib.parse as _u
58
+
59
+ _ID_RE = re.compile(r"([a-z0-9]{6,8})", re.I)
60
+
61
+ def _extract_post_id(url: str) -> str | None:
62
+ """
63
+ Heuristics to find the 6–8‑char base‑36 Reddit ID in *any* post URL:
64
+ β€’ short‑link redd.it/<id>
65
+ β€’ /r/sub/abc123/… (old style)
66
+ β€’ /comments/<id>/… (API‑friendly)
67
+ """
68
+ # 1) short‑link host
69
+ u = _u.urlparse(url)
70
+ if u.netloc in {"redd.it", "www.redd.it"}:
71
+ return u.path.lstrip("/").split("/")[0] or None
72
+
73
+ # 2) /comments/<id>/ pattern (works already)
74
+ m = re.search(r"/comments/([a-z0-9]{6,8})", url, re.I)
75
+ if m:
76
+ return m.group(1)
77
+
78
+ # 3) generic β€œ/r/<sub>/<id>/” or trailing β€œβ€¦/<id>”
79
+ parts = [p for p in u.path.split("/") if p]
80
+ for p in parts[::-1]: # search from right‑most
81
+ if _ID_RE.fullmatch(p):
82
+ return p
83
+ return None
84
+
85
+ # ----------------------------------------------------------------------
86
+ # Reddit OAuth helper – app‑only token (read‑only)
87
+ # ----------------------------------------------------------------------
88
+ import base64
89
+ import threading
90
+
91
+ _TOKEN_LOCK = threading.Lock()
92
+ _REDDIT_TOKEN_CACHE: dict[str, float | str] = {"token": None, "expires": 0.0}
93
+
94
+ def get_reddit_token(client_id: str, client_secret: str) -> str | None:
95
+ """
96
+ Return a cached bearer token obtained via Reddit's client‑credentials flow.
97
+ Returns None on error so callers can fall back to other scraping paths.
98
+ """
99
+ now = time.time()
100
+
101
+ # Fast path – cached and still valid
102
+ if (_tok := _REDDIT_TOKEN_CACHE["token"]) and now < _REDDIT_TOKEN_CACHE["expires"] - 30:
103
+ return _tok # 30‑sec grace
104
+
105
+ with _TOKEN_LOCK: # only one thread refreshes
106
+ # Re‑check after acquiring the lock
107
+ if (_tok := _REDDIT_TOKEN_CACHE["token"]) and now < _REDDIT_TOKEN_CACHE["expires"] - 30:
108
+ return _tok
109
+
110
+ try:
111
+ auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
112
+ headers = {"User-Agent": _REDDIT_UA}
113
+ data = {"grant_type": "client_credentials"} # app‑only, read‑only
114
+ r = requests.post(
115
+ "https://www.reddit.com/api/v1/access_token",
116
+ auth=auth,
117
+ data=data,
118
+ headers=headers,
119
+ timeout=10,
120
+ )
121
+ r.raise_for_status()
122
+ payload = r.json()
123
+ token = payload["access_token"]
124
+ ttl = int(payload.get("expires_in", 3600))
125
+ _REDDIT_TOKEN_CACHE.update({"token": token, "expires": now + ttl})
126
+ return token
127
+ except Exception as e:
128
+ logging.warning("Reddit token fetch failed: %s", e)
129
+ return None
130
+
131
+
132
+
133
+ @retry
134
+ def reddit_official_api(url: str, client_id: str, client_secret: str) -> str | None:
135
+ """
136
+ β€’ Works for *any* Reddit permalink or short‑link.
137
+ β€’ If the URL is a subreddit root (/r/<sub>) it still fetches 3 hot posts + top comment (unchanged).
138
+ """
139
+ token = get_reddit_token(client_id, client_secret)
140
+ if not token:
141
+ return None
142
+
143
+ headers = {
144
+ "Authorization": f"bearer {token}",
145
+ "User-Agent": _REDDIT_UA,
146
+ }
147
+
148
+ # ────────────────────────────────────────────────────────────────────
149
+ # 1. Try to treat it as a *post* link by extracting an ID
150
+ # ────────────────────────────────────────────────────────────────────
151
+ post_id = _extract_post_id(url)
152
+ if post_id:
153
+ try:
154
+ r = requests.get(
155
+ f"https://oauth.reddit.com/comments/{post_id}",
156
+ headers=headers,
157
+ params={"limit": 5, "depth": 2, "raw_json": 1},
158
+ timeout=10,
159
+ )
160
+ r.raise_for_status()
161
+ data = r.json()
162
+
163
+ post = data[0]["data"]["children"][0]["data"]
164
+ title = post.get("title", "")
165
+ body = post.get("selftext", "")
166
+ author = post.get("author", "")
167
+
168
+ comments = []
169
+ if len(data) > 1:
170
+ for c in data[1]["data"]["children"][:50]:
171
+ if c["kind"] == "t1":
172
+ c_auth = c["data"].get("author", "")
173
+ c_body = c["data"].get("body", "")
174
+ if c_body:
175
+ comments.append(f"u/{c_auth}: {c_body}")
176
+
177
+ out = f"Title: {title}\nPosted by: u/{author}\n\n"
178
+ if body:
179
+ out += f"{body}\n\n"
180
+ if comments:
181
+ out += "Top comments:\n\n" + "\n\n".join(comments)
182
+ return out.strip()
183
+
184
+ except Exception as e:
185
+ logging.debug("Official API post fetch failed (%s); will try other strategies", e)
186
+
187
+ # ────────────────────────────────────────────────────────────────────
188
+ # 2. If not a post (or the fetch above failed) treat as *subreddit*
189
+ # root and list 3 hot posts, each with top comment (unchanged).
190
+ # ────────────────────────────────────────────────────────────────────
191
+ m_sub = re.search(r"reddit\.com/r/([^/?#]+)", url)
192
+ if not m_sub:
193
+ return None # allow caller to fall back
194
+
195
+ subreddit = m_sub.group(1)
196
+ try:
197
+ r = requests.get(
198
+ f"https://oauth.reddit.com/r/{subreddit}/hot",
199
+ headers=headers,
200
+ params={"limit": 3, "raw_json": 1},
201
+ timeout=10,
202
+ )
203
+ r.raise_for_status()
204
+ posts = r.json()["data"]["children"]
205
+
206
+ out_blocks = []
207
+ for p in posts:
208
+ pd = p["data"]
209
+ pid = pd["id"]
210
+ title = pd.get("title", "")
211
+ auth = pd.get("author", "")
212
+ link = pd.get("permalink", "")
213
+
214
+ # fetch one top comment
215
+ top_comment = ""
216
+ try:
217
+ c = requests.get(
218
+ f"https://oauth.reddit.com/comments/{pid}",
219
+ headers=headers,
220
+ params={"limit": 1, "depth": 1, "raw_json": 1},
221
+ timeout=10,
222
+ ).json()
223
+ if len(c) > 1:
224
+ for cmt in c[1]["data"]["children"]:
225
+ if cmt["kind"] == "t1":
226
+ cauth = cmt["data"].get("author", "")
227
+ cbody = cmt["data"].get("body", "")
228
+ top_comment = f"u/{cauth}: {cbody}"
229
+ break
230
+ except Exception:
231
+ pass
232
+
233
+ block = f"Title: {title}\nPosted by: u/{auth}\nLink: https://www.reddit.com{link}\n"
234
+ if top_comment:
235
+ block += f"Top comment:\n{top_comment}"
236
+ out_blocks.append(block)
237
+
238
+ return "\n\n---\n\n".join(out_blocks)
239
+
240
+ except Exception as e:
241
+ logging.debug("Official API subreddit fetch failed: %s", e)
242
+ return None
243
+
244
+
245
+ @retry
246
+ def _reddit_old_version(url: str) -> str | None:
247
+ old_url = url.replace("www.reddit.com", "old.reddit.com")
248
+ try:
249
+ r = _SESS.get(old_url, headers={"User-Agent": _REDDIT_UA}, timeout=(CFG.connect_to, CFG.read_to))
250
+ if r.status_code != 200:
251
+ return None
252
+
253
+ soup = BeautifulSoup(r.text, "lxml")
254
+ title = soup.select_one(".title").text.strip() if soup.select_one(".title") else ""
255
+ author = soup.select_one(".author").text.strip() if soup.select_one(".author") else ""
256
+ post_body = soup.select_one(".usertext-body")
257
+ post_text = post_body.get_text(strip=True) if post_body else ""
258
+
259
+ comments = []
260
+ for comment in soup.select(".comment")[:50]:
261
+ c_author = comment.select_one(".author")
262
+ c_body = comment.select_one(".usertext-body")
263
+ if c_author and c_body:
264
+ comments.append(f"u/{c_author.text}: {c_body.get_text(strip=True)}")
265
+
266
+ result = f"Title: {title}\nPosted by: u/{author}\n\n"
267
+ if post_text:
268
+ result += f"{post_text}\n\n"
269
+ if comments:
270
+ result += "Top comments:\n\n" + "\n\n".join(comments)
271
+
272
+ return result.strip()
273
+ except Exception:
274
+ print("old reddit failed")
275
+ return None
276
+
277
+ @retry
278
+ def _pushshift_fallback(url: str) -> str | None:
279
+ m = re.search(r"/comments/([a-z0-9]{6,8})", url)
280
+ if not m:
281
+ return None
282
+ link_id = m.group(1)
283
+ try:
284
+ pst = _SESS.get(
285
+ "https://api.pushshift.io/reddit/submission/search/",
286
+ params={"ids": link_id, "size": 1},
287
+ timeout=10,
288
+ ).json()["data"]
289
+ post_txt = pst[0]["selftext"] if pst else ""
290
+
291
+ com = _SESS.get(
292
+ "https://api.pushshift.io/reddit/comment/search/",
293
+ params={"link_id": link_id, "sort": "desc", "size": 3},
294
+ timeout=10,
295
+ ).json()["data"]
296
+ top_txt = "\n\n".join(c["body"] for c in com)
297
+
298
+ txt = (post_txt + "\n\n" + top_txt).strip()
299
+ return txt or None
300
+ except Exception:
301
+ return None
302
+
303
+ def fetch_reddit(url: str) -> str:
304
+ txt = _reddit_old_version(url)
305
+ if txt:
306
+ return "[Retrieved from Reddit]" + txt[:CFG.text_cap]
307
+
308
+ if CFG.reddit_client_id and CFG.reddit_client_secret:
310
+ txt = reddit_official_api(url, CFG.reddit_client_id, CFG.reddit_client_secret)
311
+ if txt:
312
+ return "[Retrieved from Reddit]" + txt[:CFG.text_cap]
313
+
314
+ txt = _reddit_json_api(url)
315
+ if txt:
316
+ return "[Retrieved from Reddit]" + txt[:CFG.text_cap]
317
+
318
+ txt = _pushshift_fallback(url)
319
+ if txt:
320
+ return "[Retrieved from Reddit]" + txt[:CFG.text_cap]
321
+
322
+
323
+ return fetch_blocked_site(url)[:CFG.text_cap]
324
+
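fetch_reddit tries its strategies in order (old.reddit scrape, official OAuth API, public .json endpoint, Pushshift) before falling back to the generic blocked-site fetch. A minimal call sketch; the permalink is illustrative, not a real post.

post = fetch_reddit("https://www.reddit.com/r/Python/comments/abc123/example_post/")
print(post[:300])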
web_agents_5/fetchers/youtube_fetcher.py ADDED
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+ import logging, re
3
+ from config import CFG, _SESS
4
+ from web_helpers import retry, fetch_blocked_site
5
+
6
+ try:
7
+ import yt_dlp
8
+ _HAS = True
9
+ except ImportError:
10
+ _HAS = False
11
+
12
+ _LANGS = ["en", "en-US"]
13
+
14
+ @retry
15
+ def fetch_youtube(url: str) -> str:
16
+ if not _HAS:
17
+ return fetch_blocked_site(url)[:CFG.text_cap]
18
+
19
+ try:
20
+ ydl_opts = {"quiet": True, "no_warnings": True,
21
+ "writesubtitles": True, "writeautomaticsub": True,
22
+ "skip_download": True}
23
+ with yt_dlp.YoutubeDL(ydl_opts) as y:
24
+ info = y.extract_info(url, download=False)
25
+
26
+ subs = info.get("subtitles", {}) or {}
27
+ auto = info.get("automatic_captions", {}) or {}
28
+ tracks = next((subs.get(l) or auto.get(l) for l in _LANGS
29
+ if subs.get(l) or auto.get(l)), None)
30
+ if not tracks:
31
+ tracks = next(iter(subs.values()), []) or next(iter(auto.values()), [])
32
+
33
+ if tracks:
34
+ cap_url = tracks[0]["url"]
35
+ if "fmt=" not in cap_url: cap_url += "&fmt=json3"
36
+ r = _SESS.get(cap_url, timeout=(CFG.connect_to, CFG.read_to))
37
+ r.raise_for_status()
38
+ if cap_url.endswith(".vtt"):
39
+ text = " ".join(l for l in r.text.splitlines()
40
+ if l and "-->" not in l and not re.match(r"\d{2}:\d{2}", l))
41
+ else:
42
+ text = " ".join(seg["utf8"] for ev in r.json()["events"]
43
+ for seg in ev.get("segs", []))
44
+ if text: return "[Retrieved from yt-dlp] " + text[:CFG.text_cap]
45
+
46
+ meta = (info.get("title","") + "\n\n" + info.get("description","")).strip()
47
+ return "[Retrieved from yt-dlp] " + meta[:CFG.text_cap]
48
+ except Exception as e:
49
+ logging.error("YouTube fetch failed %s: %s", url, e)
50
+ return fetch_blocked_site(url)[:CFG.text_cap]
web_agents_5/fetchers_async.py ADDED
@@ -0,0 +1,155 @@
83
+ import asyncio, logging, time
84
+
85
+ from fetchers.pdf_fetcher import fetch_pdf
86
+ from fetchers.youtube_fetcher import fetch_youtube
87
+ from fetchers.reddit_fetcher import fetch_reddit
88
+ from fetchers.github_fetcher import fetch_github
89
+ from fetchers.jina_fetcher import fetch_jina
90
+ from fetchers.crawl4ai_fetcher import fetch_crawl4ai
91
+ from fetchers.basic_fetcher import fetch_html
92
+
93
+ _ERR_PREFIXES = ("[error", "[failed", "[unable")
94
+
95
+ def _looks_error(txt: str | None) -> bool:
96
+ return not txt or txt.strip().lower().startswith(_ERR_PREFIXES)
97
+
98
+ # per-fetcher hard caps (seconds)
99
+ _FETCHER_TIMEOUTS = {
100
+ "fetch_jina": 20.0,
101
+ "fetch_github": 10.0,
102
+ "fetch_crawl4ai": 40.0,
103
+ "fetch_html": 20.0,
104
+ "fetch_pdf": 30.0,
105
+ "fetch_youtube": 30.0,
106
+ "fetch_reddit": 10.0,
107
+ }
108
+
109
+
110
+ async def fetch_url(url: str) -> str:
111
+ url_l = url.lower()
112
+
113
+ async def timed_fetch(fn) -> str | None:
114
+ name = fn.__name__
115
+ timeout = _FETCHER_TIMEOUTS.get(name, 60.0)
116
+ start_ts = time.perf_counter()
117
+ try:
118
+ # choose sync or async execution path
119
+ coro = fn(url) if asyncio.iscoroutinefunction(fn) else asyncio.to_thread(fn, url)
120
+ result = await asyncio.wait_for(coro, timeout=timeout)
121
+ elapsed = (time.perf_counter() - start_ts) * 1000
122
+ if result and not _looks_error(result):
123
+ logging.info(f"[{name}] βœ… success in {elapsed:.1f} ms")
124
+ return result
125
+ logging.warning(f"[{name}] ❌ error response in {elapsed:.1f} ms")
126
+ except asyncio.TimeoutError:
127
+ logging.warning(f"[{name}] ⏱️ timed-out after {timeout}s")
128
+ except Exception as e:
129
+ elapsed = (time.perf_counter() - start_ts) * 1000
130
+ logging.warning(f"[{name}] πŸ’₯ exception in {elapsed:.1f} ms β†’ {e}")
131
+ return None
132
+
133
+ async def try_chain(*fetchers) -> str | None:
134
+ for fn in fetchers:
135
+ if result := await timed_fetch(fn):
136
+ return result
137
+ return None
138
+
139
+ # -------------- domain-specific chains ---------------
140
+ if "github.com" in url_l:
141
+ return await try_chain(fetch_jina, fetch_github, fetch_crawl4ai) or "[error fetch_url exhausted all methods]"
142
+ if "wikipedia.org" in url_l:
143
+ return await try_chain(fetch_html, fetch_jina, fetch_crawl4ai) or "[error fetch_url exhausted all methods]"
144
+ if "reddit.com" in url_l:
145
+ return await try_chain(fetch_jina, fetch_reddit, fetch_html) or "[error fetch_url exhausted all methods]"
146
+ if "quora.com" in url_l:
147
+ return await try_chain(fetch_crawl4ai, fetch_jina, fetch_html) or "[error fetch_url exhausted all methods]"
148
+ if "youtube.com" in url_l or "youtu.be" in url_l:
149
+ return await try_chain(fetch_jina, fetch_youtube) or "[error fetch_url exhausted all methods]"
150
+ if url_l.endswith(".pdf") or "pdf" in url_l:
151
+ return await try_chain(fetch_jina, fetch_pdf, fetch_html, fetch_crawl4ai) or "[error fetch_url exhausted all methods]"
152
+
153
+ # -------------- generic fallback ---------------------
154
+ return (await try_chain(fetch_jina, fetch_crawl4ai, fetch_html)
155
+ or "[error fetch_url exhausted all methods]")
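A minimal sketch of the orchestrator; every fetcher in a chain runs under its _FETCHER_TIMEOUTS budget, and the first non-error result wins.

import asyncio
body = asyncio.run(fetch_url("https://en.wikipedia.org/wiki/Information_retrieval"))
print(body[:300])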
web_agents_5/host_serper2.sh ADDED
@@ -0,0 +1,6 @@
1
+ export PYTHONPATH=./
2
+ export MAX_OUTBOUND=256
3
+ export JINA_CACHE_DIR=./../.cache/jina_cache
4
+ export SERPER_CACHE_DIR=./../.cache/serper_cache
5
+
6
+ python ./web_agents_5/sandbox_serper.py --port $PORT_SERPER_HOST --workers 256
web_agents_5/sandbox_serper.py ADDED
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env python3
2
+ """sandbox_serper.py – resilient Serper sandbox v2.1
3
+
4
+ Fixes
5
+ -----
6
+ * Moved `global _MAX_OUTBOUND, _SEM` declaration to the **top of `main()`**
7
+ before any reference, eliminating the `SyntaxError: name used prior to
8
+ global declaration`.
9
+ * No functional changes otherwise.
10
+ """
11
+
12
+ from __future__ import annotations
13
+ import argparse, asyncio, logging, os, time, traceback
14
+ from fastapi import FastAPI
15
+ from fastapi.concurrency import run_in_threadpool
16
+ from pydantic import BaseModel
17
+ import uvicorn
19
+ # ───────────────────────── logging setup ──────────────────────────
20
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
21
+ log = logging.getLogger("sandbox_serper")
22
+
23
+ app = FastAPI()
24
+
25
+ class Req(BaseModel):
26
+ env: str
27
+ call: str
28
+ timeout: int = 60
29
+
30
+ # ───────────────────── global throughput gate ─────────────────────
31
+ _MAX_OUTBOUND = int(os.getenv("MAX_OUTBOUND", "10"))
32
+ _SEM = asyncio.Semaphore(_MAX_OUTBOUND)
33
+
34
+ # ───────────────────────── endpoint ───────────────────────────────
35
+ @app.post("/execute")
36
+ async def execute(req: Req):
38
+ async with _SEM: # ❰❰ throttle
39
+ result = await run_in_threadpool(_safe_eval, req.env,
40
+ req.call, req.timeout)
41
+
42
+ return {
43
+ "output": "",
44
+ "result": result,
45
+ "error": None if not str(result).startswith("[tool-error]") else result,
46
+ }
47
+
48
+ # ───────────────────── sandbox evaluator ──────────────────────────
49
+
50
+ def _safe_eval(env: str, call: str, timeout: int):
51
+ start = time.time(); loc: dict = {}
52
+ try:
53
+ exec(env, loc, loc)  # shared dict so env-defined helpers can reference each other
54
+ exec(f"response = {call}", loc, loc)
55
+ if time.time() - start > timeout:
56
+ raise TimeoutError(f"wall-clock timeout for call {call}")
57
+ return loc.get("response", "[tool-error] no response var")
58
+ except Exception as e:
59
+ log.error("tool error: %s\n%s", e, traceback.format_exc())
60
+ return f"[tool-error] {e}"
61
+
62
+ # ─────────────────────────── main ────────────────────────────────
63
+
64
+ def main():
65
+ global _MAX_OUTBOUND, _SEM # ← moved to top
66
+
67
+ ap = argparse.ArgumentParser()
68
+ ap.add_argument("--port", type=int, default=1211)
69
+ ap.add_argument("--workers", type=int, default=1)
70
+ ap.add_argument("--reload", action="store_true")
71
+ # ap.add_argument("--max_outbound", type=int, default=_MAX_OUTBOUND,
72
+ # help="simultaneous outbound calls across all workers")
73
+ args = ap.parse_args()
74
+
75
+ _SEM = asyncio.Semaphore(_MAX_OUTBOUND)
76
+
77
+ if args.reload and args.workers > 1:
78
+ raise SystemExit("--reload and --workers>1 are mutually exclusive")
79
+
80
+ # log.info("Starting sandbox :%d | workers=%d | max_outbound=%d",
81
+ # args.port, args.workers, _MAX_OUTBOUND)
82
+
83
+ if args.workers > 1:
84
+ uvicorn.run("sandbox_serper:app", host="0.0.0.0", port=args.port, workers=args.workers)
85
+ else:
86
+ uvicorn.run(app, host="0.0.0.0", port=args.port, reload=args.reload)
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
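A minimal client sketch for the /execute endpoint, assuming the sandbox is listening on the default port 1211; env defines the callable namespace and call is the expression evaluated inside it.

import requests
payload = {
    "env": "def add(a, b):\n    return a + b",
    "call": "add(2, 3)",
    "timeout": 10,
}
r = requests.post("http://localhost:1211/execute", json=payload, timeout=30)
print(r.json())   # {'output': '', 'result': 5, 'error': None}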
web_agents_5/search_api.py ADDED
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+ import asyncio
3
+ from typing import Dict, List
4
+ from utils import (google_search, url_hits_to_markdown,
5
+ search_result_to_markdown,
6
+ async_search_and_extract, _bad)
7
+ from fetchers_async import fetch_url
8
+ from compressor import compress_text, query_text
9
+ from config import CFG
10
+ import logging
11
+
12
+ # print("PYTHONPATH:", sys.path)
13
+
14
+ def web_search(query):
15
+ return search_urls(query = query, top_k = 10)
16
+
17
+ def web_visit(url):
18
+ return open_url(url = url, compress = False)
19
+
20
+ # ── 1. search_urls ──────────────────────────────────────────────────────
21
+ def search_urls(query: str, top_k: int = 10) -> str:
22
+ return url_hits_to_markdown(google_search(query, top_k))
23
+
24
+ # ── 2. open_url ─────────────────────────────────────────────────────────
25
+ def open_url(url: str, *, compress: bool = True, pct: float = CFG.pct,
26
+ model: str = "gpt-4o-mini") -> str:
27
+ if _bad(url): return _bad(url)
28
+ try:
29
+ body = asyncio.run(fetch_url(url))
30
+ body = str(body)
31
+ except Exception as e:
32
+ return f"[error fetching URL: {e}]"
33
+ if compress:
34
+ try:
35
+ body = compress_text(body, pct=pct, model=model)
36
+ except Exception as e:
37
+ body = f"[compression failed: {e}]\n\n{body[:2000]}"
38
+ return body
39
+
40
+ # ── 3. search_and_parse_query ───────────────────────────────────────────
41
+ def search_and_parse_query(query: str, top_k: int = 3, *,
42
+ compress: bool = True, pct: float = CFG.pct) -> str:
43
+ blocks = asyncio.run(async_search_and_extract(query, top_k))
44
+ if compress:
45
+ for b in blocks:
46
+ try:
47
+ # compress_text returns the compressed narrative as a plain string
48
+ b["body"] = compress_text(b["body"], pct=pct)
49
+
50
+
51
+ except Exception as e:
52
+ b["body"] = f"[compression failed: {e}]\n\n{b['body']}"
53
+ return search_result_to_markdown(blocks)
54
+
55
+ # ── 4. query_url ────────────────────────────────────────────────────────
56
+ def query_url(url: str, goal: str, *, model: str = "gpt-4.1-mini") -> str:
57
+ if _bad(url): return _bad(url)
58
+ body = asyncio.run(fetch_url(url))
59
+ if not body or body.startswith("[error"):
60
+ return f"[failed to retrieve content from {url}]\n\n{body}"
61
+ return query_text(url, body, goal, model=model)['extracted_info']
62
+
63
+
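A minimal sketch of the tool surface exposed above, assuming SERPER_API_KEY and OPENAI_API_KEY are set; the query and URLs are illustrative.

print(search_urls("fathom deepresearch agent", top_k=3))
print(open_url("https://example.com", compress=False)[:300])
print(query_url("https://example.com", goal="What is this page about?"))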
web_agents_5/utils.py ADDED
@@ -0,0 +1,145 @@
1
+ from __future__ import annotations
2
+ import asyncio, logging, re, tiktoken
3
+ from typing import Dict, List
4
+ from config import CFG, _SESS
5
+ from fetchers_async import fetch_url
6
+ from web_helpers import retry
7
+ from urllib.parse import urlparse
8
+
9
+ enc = tiktoken.get_encoding("cl100k_base")
10
+
11
+ # ── Google / Serper search ──────────────────────────────────────────────
12
+
28
+ import hashlib, json, logging, os, time
29
+ from typing import List, Dict
30
+
31
+ def _canon_query(q: str) -> str:
32
+ # Normalize whitespace to avoid duplicate keys for e.g. "foo bar"
33
+ return " ".join((q or "").strip().split())
34
+
35
+
36
+ def _search_cache_key(query: str, top_k: int) -> str:
37
+ cq = _canon_query(query)
38
+ raw = f"{top_k}|{cq}"
39
+ return hashlib.sha256(raw.encode("utf-8")).hexdigest() + ".json"
40
+
41
+ def _search_cache_path(query: str, top_k: int) -> str:
42
+ root = CFG.serper_cache_dir
43
+ os.makedirs(root, exist_ok=True)
44
+ return os.path.join(root, _search_cache_key(query, top_k))
45
+
46
+ def _ttl_seconds() -> int:
47
+ # 0 or missing β†’ no expiry
48
+ try:
49
+ return int(getattr(CFG, "search_cache_ttl", 0) or int(os.environ.get("SEARCH_CACHE_TTL", "0")))
50
+ except Exception:
51
+ return 0
52
+
53
+ def _load_search_cache(path: str) -> List[Dict[str, str]] | None:
54
+ try:
55
+ if not os.path.exists(path) or os.path.getsize(path) <= 2:
56
+ return None
57
+ ttl = _ttl_seconds()
58
+ if ttl > 0:
59
+ age = time.time() - os.path.getmtime(path)
60
+ if age > ttl:
61
+ return None
62
+ with open(path, "r", encoding="utf-8") as f:
63
+ data = json.load(f)
64
+ # Basic shape check: list of dicts with expected keys
65
+ if isinstance(data, list):
66
+ return data
67
+ except Exception as e:
68
+ logging.debug("Serper cache read failed (%s): %s", path, e)
69
+ return None
70
+
71
+ def _save_search_cache(path: str, hits: List[Dict[str, str]]) -> None:
72
+ try:
73
+ tmp = f"{path}.tmp.{os.getpid()}"
74
+ with open(tmp, "w", encoding="utf-8") as f:
75
+ json.dump(hits, f, ensure_ascii=False)
76
+ os.replace(tmp, path) # atomic on same FS
77
+ except Exception as e:
78
+ logging.debug("Serper cache write failed (%s): %s", path, e)
79
+
80
+
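A small sketch of the cache semantics these helpers give you. The key is a SHA-256 of the canonicalised query plus `top_k`, so it is deterministic; `SEARCH_CACHE_TTL` is the env-var override already read by `_ttl_seconds` above:

```python
# cache-key determinism sketch: same normalised query + top_k → same file
k1 = _search_cache_key("fathom  search   4b", 5)
k2 = _search_cache_key(" fathom search 4b ", 5)
assert k1 == k2   # whitespace differences collapse to one cache entry

# optional expiry: export SEARCH_CACHE_TTL=86400 to drop entries after a
# day; leaving it unset (or 0) keeps cached Serper responses forever
```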
+ @retry
+ def google_search(query: str, top_k: int = 10) -> List[Dict[str, str]]:
+     if not CFG.serper_key:
+         raise EnvironmentError("SERPER_API_KEY not set")
+
+     cpath = _search_cache_path(query, top_k)
+     cached = _load_search_cache(cpath)
+     if cached is not None:
+         logging.info("Serper search (cache hit) ← %r (top_k=%d)",
+                      _canon_query(query), top_k)
+         return cached
+
+     r = _SESS.post(
+         CFG.serper_ep,
+         headers={"X-API-KEY": CFG.serper_key, "Content-Type": "application/json"},
+         json={"q": query},
+         timeout=20,
+     )
+     r.raise_for_status()
+     hits: List[Dict[str, str]] = []
+     for it in r.json().get("organic", []):
+         hits.append({
+             "title": it.get("title", ""),
+             "link": it.get("link", ""),
+             "snippet": it.get("snippet", ""),
+         })
+         if len(hits) == top_k:
+             break
+
+     _save_search_cache(cpath, hits)
+     return hits
+
+
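A minimal usage sketch for the cached search path above. It assumes `SERPER_API_KEY` is exported, `config.py` is importable, and that you run from the `web_agents_5/` directory so `utils` resolves:

```python
from utils import google_search  # web_agents_5/utils.py

hits = google_search("site:arxiv.org deep research agents", top_k=3)
for h in hits:
    print(h["title"], "→", h["link"])

# a second identical call is served from CFG.serper_cache_dir, so it costs
# no Serper credits and is safe to wrap in retry loops
hits_again = google_search("site:arxiv.org deep research agents", top_k=3)
```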
+ # ── async extract per hit ───────────────────────────────────────────────
+ async def async_search_and_extract(query: str, top_k: int = 5) -> List[Dict]:
+     hits = google_search(query, top_k)   # blocking search, then async fetches
+     async def enrich(h):
+         return {**h, "body": await fetch_url(h["link"])}
+     return await asyncio.gather(*(enrich(h) for h in hits))
+
+ # ── markdown helpers ────────────────────────────────────────────────────
+ def url_hits_to_markdown(hits: List[Dict[str, str]]) -> str:
+     out = []
+     for i, h in enumerate(hits, 1):
+         out.append(f"### {i}. {h['title']}\n**URL**: {h['link']}\n\n"
+                    f"**Snippet**: {h['snippet']}\n")
+     return "\n---\n\n".join(out)
+
+ def search_result_to_markdown(blocks: List[Dict]) -> str:
+     out = []
+     for i, b in enumerate(blocks, 1):
+         out.append(f"### {i}. **Title**: {b['title']}\n**URL**: {b['link']}\n\n"
+                    f"**Snippet**: {b['snippet']}\n\n**Content**:\n{b['body']}\n")
+     return "\n---\n\n".join(out)
+
+ def trim_to_tokens(text: str, limit: int, model: str = "gpt-3.5-turbo") -> str:
+     # `model` is accepted for API symmetry, but the cl100k_base encoder is
+     # used for every model; keeps the head and tail, drops the middle
+     ids = enc.encode(text)
+     if len(ids) <= limit:
+         return text
+     keep = limit // 2
+     return enc.decode(ids[:keep] + ids[-keep:])
+
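A quick illustration of the head-and-tail trimming above (token counts are from cl100k_base and approximate; the filler text is arbitrary):

```python
long_text = "word " * 5000            # roughly 5000 tokens of filler
short = trim_to_tokens(long_text, limit=200)
# `short` keeps the first ~100 and last ~100 tokens, so a page's opening
# and conclusion both survive even when the middle is discarded
```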
+ def _bad(url: str) -> str | None:
+     p = urlparse(url)
+     if p.scheme not in ("http", "https") or not p.netloc:
+         return "[error: invalid URL – must start with http:// or https://]"
+     return None
+
web_agents_5/web_helpers.py ADDED
@@ -0,0 +1,64 @@
+ from __future__ import annotations
+ import functools, logging, re, time, requests, trafilatura
+ from typing import Callable
+ from bs4 import BeautifulSoup
+ from config import CFG, _RND
+
+ # ── retry ────────────────────────────────────────────────────────────────
+ def retry(fn: Callable) -> Callable:
+     """Exponential backoff with up to 30 % jitter; re-raises after CFG.retries."""
+     @functools.wraps(fn)
+     def _wrap(*a, **kw):
+         for i in range(CFG.retries):
+             try:
+                 return fn(*a, **kw)
+             except Exception as e:
+                 if i == CFG.retries - 1:
+                     raise
+                 delay = CFG.backoff * (2 ** i) * (1 + 0.3 * _RND.random())
+                 logging.warning("Retry %s/%s %s: %s (%.2fs)",
+                                 i + 1, CFG.retries, fn.__name__, e, delay)
+                 time.sleep(delay)
+     return _wrap
+
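A minimal sketch of the decorator in use. `CFG.retries` and `CFG.backoff` come from `config.py`; the decorated function here is a stand-in, not part of the repo:

```python
from web_helpers import retry
import requests

@retry
def flaky_fetch(url: str) -> str:
    # any exception here triggers backoff: CFG.backoff * 2**attempt * jitter
    r = requests.get(url, timeout=5)
    r.raise_for_status()
    return r.text

html = flaky_fetch("https://example.com")   # retried up to CFG.retries times
```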
+ # ── text extraction ──────────────────────────────────────────────────────
+ def extract_main_text(html: str) -> str:
+     # Tier 1: trafilatura, the highest-precision extractor
+     txt = trafilatura.extract(html, output_format="txt") or ""
+     if len(txt) >= 500:
+         return txt
+     # Tier 2: readability's article summary
+     from readability import Document
+     soup = BeautifulSoup(Document(html).summary(), "lxml")
+     txt = soup.get_text(" ", strip=True)
+     if len(txt) >= 400:
+         return txt
+     # Tier 3: strip scripts/styles and take whatever text is left
+     for tag in soup(["script", "style", "noscript"]):
+         tag.decompose()
+     return re.sub(r"\s+", " ", soup.get_text(" ").strip())
+
+ # ── last-chance fetch when everything fails ──────────────────────────────
+ @retry
+ def fetch_blocked_site(url: str) -> str:
+     hdrs = {"User-Agent": CFG.ua, "Referer": "https://www.google.com/"}
+     sess = requests.Session()
+     sess.headers.update(hdrs)
+
+     # 1) direct
+     try:
+         r = sess.get(url, timeout=(CFG.connect_to, CFG.read_to))
+         r.raise_for_status()
+         txt = extract_main_text(r.text)
+         if len(txt) > 500:
+             return "[Retrieved from redirected attempt]\n\n" + txt[:CFG.text_cap]
+     except Exception as e:
+         logging.debug("Direct scrape failed %s: %s", url, e)
+
+     # 2) wayback
+     try:
+         wb = f"https://web.archive.org/web/2023/{url}"
+         r = sess.get(wb, timeout=(CFG.connect_to, CFG.read_to))
+         r.raise_for_status()
+         txt = extract_main_text(r.text)
+         if len(txt) > 500:
+             return "[Retrieved from archive.org]\n\n" + txt[:CFG.text_cap]
+     except Exception as e:
+         logging.debug("Wayback scrape failed %s: %s", url, e)
+
+     return f"[Error accessing {url}. Try VPN or manual archive.org check.]"
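`fetch_blocked_site` is the tail of the fetch fallback chain. A hedged sketch of wiring it behind the primary async fetcher — `fetch_with_fallback` is hypothetical, and the `"[error ...]"` sentinel is the convention used by the tools earlier in this commit, not a guarantee of `fetch_url`'s internals:

```python
import asyncio
from fetchers_async import fetch_url
from web_helpers import fetch_blocked_site

def fetch_with_fallback(url: str) -> str:
    # try the primary async fetcher first; fall back to the blocked-site
    # path on empty bodies or the "[error ...]" sentinel
    body = str(asyncio.run(fetch_url(url)))
    if not body or body.startswith("[error"):
        body = fetch_blocked_site(url)   # direct retry, then archive.org
    return body
```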