Commit · 08a5a31
Parent(s): 2bf686d

use openrouter only instead of google-genai

Files changed:
- .gitignore +3 -0
- config.py +4 -6
- pipeline/critique_extraction.py +49 -36
- pipeline/disagreement_detection.py +39 -27
- pipeline/disagreement_resolution.py +2 -2
- pipeline/meta_review.py +1 -1
- pipeline/search_retrieval.py +126 -74
- requirements.txt +1 -3
- test_api.py +273 -0
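
Editor's note: the whole migration rests on one fact, namely that OpenRouter exposes its models through an OpenAI-compatible endpoint, so every google-generativeai call can be replaced by the openai SDK pointed at a different base URL. A minimal, self-contained sketch of the pattern this commit applies in each pipeline module (the prompt here is illustrative; the model id is the one the diff uses):

    import os
    from openai import OpenAI

    # OpenRouter speaks the OpenAI wire protocol; only the base_url changes.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY"),
    )

    response = client.chat.completions.create(
        model="google/gemini-2.5-flash-lite",  # any OpenRouter model id works here
        messages=[{"role": "user", "content": "Say hello in one word."}],
        max_tokens=16,
    )
    print(response.choices[0].message.content)
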
.gitignore ADDED
@@ -0,0 +1,3 @@
+venv/
+.env
+__pycache__/

config.py CHANGED
@@ -11,10 +11,10 @@ API_DESCRIPTION = """
 ## Automated Consensus Analysis for Peer Reviews
 
 This API provides comprehensive analysis of peer review disagreements using:
-- **LLM-based critique extraction** (Gemini 2.…)
+- **LLM-based critique extraction** (Gemini 2.5 Flash Lite via OpenRouter)
 - **Disagreement detection** between reviewers
 - **Search-augmented evidence retrieval** (Semantic Scholar, arXiv, Google Scholar, Tavily)
-- **AI-powered disagreement resolution** (DeepSeek-R1)
+- **AI-powered disagreement resolution** (DeepSeek-R1 via OpenRouter)
 - **Meta-review generation**
 
 ### Features:
@@ -29,12 +29,11 @@ MAX_REQUESTS_PER_MINUTE = int(os.getenv("MAX_REQUESTS_PER_MINUTE", "10"))
 MAX_CONCURRENT_TASKS = int(os.getenv("MAX_CONCURRENT_TASKS", "3"))
 QUEUE_MAX_SIZE = int(os.getenv("QUEUE_MAX_SIZE", "20"))
 
-# Model Configuration
-GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.…")
+# Model Configuration (all via OpenRouter)
+GEMINI_MODEL = os.getenv("GEMINI_MODEL", "google/gemini-2.5-flash-lite")
 DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "deepseek/deepseek-r1")
 
 # API Keys (from HF Spaces secrets)
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")
@@ -58,7 +57,6 @@ def validate_environment():
         ValueError: If required variables are missing
     """
     required_vars = {
-        "GEMINI_API_KEY": GEMINI_API_KEY,
         "OPENROUTER_API_KEY": OPENROUTER_API_KEY,
         "TAVILY_API_KEY": TAVILY_API_KEY,
     }

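Editor's note: the hunk drops GEMINI_API_KEY from required_vars but does not show the body of validate_environment. A plausible sketch, assuming the function simply raises on unset keys as its docstring promises; the body below is hypothetical:

    import os

    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
    TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

    def validate_environment():
        # Hypothetical body (not shown in the diff): collect required secrets
        # and raise a readable ValueError listing whichever are unset.
        required_vars = {
            "OPENROUTER_API_KEY": OPENROUTER_API_KEY,
            "TAVILY_API_KEY": TAVILY_API_KEY,
        }
        missing = [name for name, value in required_vars.items() if not value]
        if missing:
            raise ValueError(f"Missing required environment variables: {', '.join(missing)}")
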
pipeline/critique_extraction.py CHANGED
@@ -1,16 +1,21 @@
 import json
 import os
 from typing import List, Dict
-
+from openai import OpenAI
 from pydantic import BaseModel
 import asyncio
-import time
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# …
-…
+# Initialize OpenRouter client
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+
+# Model to use for critique extraction
+CRITIQUE_MODEL = "google/gemini-2.5-flash-lite"
 
 class CritiquePoint(BaseModel):
     Methodology: List[str] = []
@@ -21,7 +26,7 @@ class CritiquePoint(BaseModel):
 
 async def extract_single_critique(review_text: str, retries: int = 5) -> Dict:
     """
-    Extract critique points from a single review using Gemini
+    Extract critique points from a single review using OpenRouter (Gemini)
 
     Args:
         review_text: The review text to analyze
@@ -30,59 +35,67 @@ async def extract_single_critique(review_text: str, retries: int = 5) -> Dict:
     Returns:
         Dictionary with categorized critique points
     """
-    prompt = f"""
-    Extract key critique points from the following research paper review.
-    Categorize them into aspects: Methodology, Experiments, Clarity, Significance, Novelty.
-    Return a structured JSON with these categories as keys and lists of critique points as values.
-
-    Review:
-    {review_text}
+    system_prompt = """
+    You are an expert at analyzing academic peer reviews.
+    Extract key critique points from the review and categorize them.
 
     Respond with ONLY valid JSON in this format:
-    {
+    {
     "Methodology": ["point1", "point2"],
     "Experiments": ["point1"],
     "Clarity": ["point1", "point2"],
     "Significance": ["point1"],
     "Novelty": ["point1"]
-    }
+    }
+    """
+
+    user_prompt = f"""
+    Extract key critique points from the following research paper review.
+    Categorize them into aspects: Methodology, Experiments, Clarity, Significance, Novelty.
+
+    Review:
+    {review_text}
     """
 
-    …
-    …
-    }
-    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
             response = await asyncio.to_thread(
-                …
-                …
+                client.chat.completions.create,
+                model=CRITIQUE_MODEL,
+                messages=messages,
+                max_tokens=2048,
+                response_format={"type": "json_object"},
             )
 
-            if not response.…
-                raise ValueError("Empty response from …")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            result = json.loads(response.…)
+            result = json.loads(response.choices[0].message.content.strip())
 
             # Validate structure
             critique = CritiquePoint(**result)
             return critique.model_dump()
 
-        except genai.types.generation_types.BlockedPromptException as e:
-            print(f"Content blocked by safety filters: {e}")
-            return {
-                "Methodology": [],
-                "Experiments": [],
-                "Clarity": [],
-                "Significance": [],
-                "Novelty": [],
-                "error": "Content blocked by safety filters"
-            }
-
         except Exception as e:
+            error_msg = str(e)
+
+            # Check for content safety blocks
+            if "safety" in error_msg.lower() or "blocked" in error_msg.lower():
+                print(f"Content blocked by safety filters: {e}")
+                return {
+                    "Methodology": [],
+                    "Experiments": [],
+                    "Clarity": [],
+                    "Significance": [],
+                    "Novelty": [],
+                    "error": "Content blocked by safety filters"
+                }
+
             wait_time = 2 ** attempt
             print(f"Attempt {attempt + 1} failed: {e}. Retrying in {wait_time}s...")

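Editor's note: two details of the new version are worth isolating. response_format={"type": "json_object"} asks OpenRouter for JSON mode, and the pydantic model then enforces the schema the prompt describes. A small sketch of that validation step (the hunk only shows the Methodology field; the other four are assumed to mirror the prompt's categories):

    from typing import List
    from pydantic import BaseModel

    class CritiquePoint(BaseModel):
        # One list per category the prompt asks for; missing keys default to [].
        Methodology: List[str] = []
        Experiments: List[str] = []
        Clarity: List[str] = []
        Significance: List[str] = []
        Novelty: List[str] = []

    parsed = CritiquePoint(**{"Methodology": ["weak baselines"]})
    print(parsed.model_dump())
    # A malformed value (e.g. a string where a list is expected) raises
    # ValidationError, which the retry loop treats like any other failure.
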
pipeline/disagreement_detection.py CHANGED
@@ -2,15 +2,21 @@ import json
 import os
 from typing import List, Dict
 from itertools import combinations
-
+from openai import OpenAI
 from pydantic import BaseModel, Field
 import asyncio
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# …
-…
+# Initialize OpenRouter client
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+
+# Model to use for disagreement detection
+DISAGREEMENT_MODEL = "google/gemini-2.5-flash-lite"
 
 class DisagreementDetails(BaseModel):
     Methodology: List[str] = Field(default_factory=list)
@@ -48,7 +54,24 @@ async def compare_review_pair(
     Returns:
         Disagreement analysis results
     """
-    prompt = f"""
+    system_prompt = """
+    You are an expert at analyzing academic peer review disagreements.
+    Compare reviews and identify disagreements across different aspects.
+
+    Respond with ONLY valid JSON in this exact format:
+    {
+    "disagreement_score": 0.5,
+    "disagreement_details": {
+        "Methodology": ["specific disagreement point 1"],
+        "Experiments": ["specific disagreement point 1"],
+        "Clarity": [],
+        "Significance": ["specific disagreement point 1"],
+        "Novelty": []
+    }
+    }
+    """
+
+    user_prompt = f"""
     Compare the following two reviews and identify disagreements across different aspects.
     Assess disagreement level (0.0 = perfect agreement, 1.0 = complete disagreement) and
     list specific points of disagreement for each category.
@@ -66,38 +89,27 @@ async def compare_review_pair(
     Clarity: {list_to_string(review2.get('Clarity', []))}
     Significance: {list_to_string(review2.get('Significance', []))}
     Novelty: {list_to_string(review2.get('Novelty', []))}
-
-    Respond with ONLY valid JSON in this exact format:
-    {{
-    "disagreement_score": 0.5,
-    "disagreement_details": {{
-        "Methodology": ["specific disagreement point 1"],
-        "Experiments": ["specific disagreement point 1"],
-        "Clarity": [],
-        "Significance": ["specific disagreement point 1"],
-        "Novelty": []
-    }}
-    }}
     """
 
-    …
-    …
-    }
-    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
             response = await asyncio.to_thread(
-                …
-                …
+                client.chat.completions.create,
+                model=DISAGREEMENT_MODEL,
+                messages=messages,
+                max_tokens=2048,
+                response_format={"type": "json_object"},
             )
 
-            if not response.…
-                raise ValueError("Empty response from …")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            result = json.loads(response.…)
+            result = json.loads(response.choices[0].message.content.strip())
 
             # Validate structure
             disagreement = DisagreementResult(

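Editor's note: the hunks never show where combinations is used, but with N reviews the module presumably compares every unordered pair. A sketch of that enumeration (the wiring to compare_review_pair is assumed, not shown in the diff):

    from itertools import combinations

    critiques = ["critique of review 1", "critique of review 2", "critique of review 3"]

    # All C(N, 2) index pairs: (0, 1), (0, 2), (1, 2). Each pair would be
    # handed to compare_review_pair in the real module.
    for i, j in combinations(range(len(critiques)), 2):
        print(f"compare reviews {i} and {j}")
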
pipeline/disagreement_resolution.py CHANGED
@@ -18,11 +18,11 @@ client = OpenAI(
 # Priority list of models to try
 # 1. DeepSeek R1 (Best reasoning, most expensive)
 # 2. DeepSeek R1 Distill (Good reasoning, cheaper)
-# 3. Gemini 2.…
+# 3. Gemini 2.5 Flash Lite (Cheap, fast fallback)
 MODELS = [
     "deepseek/deepseek-r1",
     "deepseek/deepseek-r1-distill-llama-70b",
-    "google/gemini-2.…
+    "google/gemini-2.5-flash-lite"
 ]
 
 class ResolutionDetails(BaseModel):

pipeline/meta_review.py CHANGED
@@ -18,7 +18,7 @@ client = OpenAI(
 MODELS = [
     "deepseek/deepseek-r1",
     "deepseek/deepseek-r1-distill-llama-70b",
-    "google/gemini-2.…
+    "google/gemini-2.5-flash-lite"
 ]
 
 class MetaReviewResult(BaseModel):

pipeline/search_retrieval.py CHANGED
@@ -1,63 +1,29 @@
 import os
 from typing import Dict, List
 import asyncio
-from …
+from openai import OpenAI
 from langchain_community.utilities import ArxivAPIWrapper, SerpAPIWrapper
 from langchain_community.tools.semanticscholar.tool import SemanticScholarQueryRun
 from langchain_community.tools.tavily_search import TavilySearchResults
-from langchain.agents import AgentType, initialize_agent, AgentExecutor
-from langchain.tools import Tool
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# Initialize LLM
-llm = …
-    …
-    max_retries=2,
-)
+# Initialize OpenRouter client for LLM calls
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+
+# Model for search/retrieval tasks
+SEARCH_MODEL = "google/gemini-2.5-flash-lite"
 
 # Initialize search tools
 semantic_scholar = SemanticScholarQueryRun()
 google_scholar = SerpAPIWrapper(params={"engine": "google_scholar"})
 arxiv_search = ArxivAPIWrapper()
 tavily_search = TavilySearchResults(max_results=5)
 
-# Define tools
-tools = [
-    Tool(
-        name="TavilySearch",
-        func=tavily_search.run,
-        description="Retrieves the latest State-of-the-Art (SoTA) research and current academic information"
-    ),
-    Tool(
-        name="SemanticScholar",
-        func=semantic_scholar.run,
-        description="Find academic papers from Semantic Scholar database"
-    ),
-    Tool(
-        name="GoogleScholar",
-        func=google_scholar.run,
-        description="Search for scholarly articles and citations"
-    ),
-    Tool(
-        name="ArxivSearch",
-        func=arxiv_search.run,
-        description="Find research papers from ArXiv preprint repository"
-    ),
-]
-
-# Initialize agent
-agent = initialize_agent(
-    tools=tools,
-    llm=llm,
-    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-    verbose=False,
-    handle_parsing_errors=True,
-    max_iterations=10
-)
-
 def combine_critiques(critique_points: List[Dict]) -> Dict[str, str]:
     """
     Combine critique points from multiple reviews into categories
@@ -82,6 +48,16 @@ def combine_critiques(critique_points: List[Dict]) -> Dict[str, str]:
 
     return combined
 
+async def run_search_tool(tool_name: str, tool_func, query: str) -> str:
+    """Run a search tool with error handling"""
+    try:
+        result = await asyncio.to_thread(tool_func, query)
+        return str(result) if result else ""
+    except Exception as e:
+        print(f"{tool_name} search failed: {e}")
+        return ""
+
+
 async def search_sota(paper_title: str, paper_abstract: str, retries: int = 3) -> str:
     """
     Search for state-of-the-art research related to the paper
@@ -94,29 +70,73 @@ async def search_sota(paper_title: str, paper_abstract: str, retries: int = 3) -> str:
     Returns:
         Summary of SoTA findings
     """
-    …
-    …
+    # Create search query
+    search_query = f"{paper_title} recent advances methodology"
+
+    # Run multiple searches in parallel
+    search_tasks = [
+        run_search_tool("Tavily", tavily_search.run, search_query),
+        run_search_tool("ArXiv", arxiv_search.run, search_query[:300]),
+        run_search_tool("SemanticScholar", semantic_scholar.run, paper_title),
+    ]
+
+    search_results = await asyncio.gather(*search_tasks)
+
+    # Combine all search results
+    combined_results = "\n\n".join([
+        f"=== Tavily Results ===\n{search_results[0]}" if search_results[0] else "",
+        f"=== ArXiv Results ===\n{search_results[1]}" if search_results[1] else "",
+        f"=== Semantic Scholar Results ===\n{search_results[2]}" if search_results[2] else "",
+    ])
+
+    if not combined_results.strip():
+        return "No SoTA research found from available sources."
+
+    # Use LLM to synthesize the results
+    system_prompt = """
+    You are an expert at synthesizing academic research findings.
+    Summarize the search results to identify state-of-the-art approaches and recent advances.
+    Focus on methodologies, key findings, and how they relate to the paper being reviewed.
+    """
+
+    user_prompt = f"""
+    Paper Title: {paper_title}
+    Paper Abstract: {paper_abstract[:500]}
+
+    Search Results:
+    {combined_results[:4000]}
+
+    Provide a concise summary of the state-of-the-art research relevant to this paper.
+    """
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
-            …
+            response = await asyncio.to_thread(
+                client.chat.completions.create,
+                model=SEARCH_MODEL,
+                messages=messages,
+                max_tokens=2048,
+            )
 
-            if not …
-                raise ValueError("Empty …")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            return …
+            return response.choices[0].message.content.strip()
 
         except Exception as e:
             wait_time = 2 ** attempt
-            print(f"SoTA …")
+            print(f"SoTA synthesis attempt {attempt + 1} failed: {e}")
 
             if attempt < retries - 1:
                 await asyncio.sleep(wait_time)
             else:
-                …
+                # Return raw results if synthesis fails
+                return f"Raw search results (synthesis failed):\n{combined_results[:2000]}"
 
 async def retrieve_evidence_for_category(
     category: str,
@@ -137,28 +157,60 @@ async def retrieve_evidence_for_category(
     if critiques == "No critiques" or not critiques.strip():
         return f"No critiques to validate for {category}"
 
-    …
-        f"related to {category}: {critiques[:500]}"
-    )
+    # Create targeted search query
+    search_query = f"{category} research validation {critiques[:200]}"
 
-    …
+    # Run search
+    try:
+        tavily_result = await run_search_tool("Tavily", tavily_search.run, search_query)
+        arxiv_result = await run_search_tool("ArXiv", arxiv_search.run, search_query[:200])
+
+        combined = f"{tavily_result}\n{arxiv_result}".strip()
+
+        if not combined:
+            return f"No evidence found for {category} critiques"
+
+        # Use LLM to analyze relevance
+        system_prompt = f"""
+        You are an expert at evaluating academic critiques.
+        Analyze the search results to find evidence that supports or contradicts the critiques.
+        Focus on the {category} aspect.
+        """
+
+        user_prompt = f"""
+        Critiques for {category}: {critiques}
+
+        Search Results:
+        {combined[:2000]}
+
+        Summarize the evidence found that relates to these critiques.
+        """
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+
+        for attempt in range(retries):
+            try:
+                response = await asyncio.to_thread(
+                    client.chat.completions.create,
+                    model=SEARCH_MODEL,
+                    messages=messages,
+                    max_tokens=1024,
+                )
+
+                if response.choices and response.choices[0].message.content.strip():
+                    return response.choices[0].message.content.strip()
+
+            except Exception as e:
+                if attempt < retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+
+        return f"Evidence retrieval completed for {category}"
+
+    except Exception as e:
+        return f"Error retrieving evidence for {category}: {str(e)}"
 
 async def retrieve_evidence(combined_critiques: Dict[str, str]) -> Dict[str, str]:
     """

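Editor's note: this file sees the deepest change. The LangChain ReAct agent (up to max_iterations=10 tool-calling rounds per query) is replaced by a fixed fan-out over the search tools plus a single synthesis call, which makes latency and cost per request predictable. The concurrency shape, reduced to a runnable toy (the fetch stub stands in for the blocking tool calls that the real module wraps with asyncio.to_thread):

    import asyncio

    async def fetch(source: str, query: str) -> str:
        # Stand-in for a blocking search tool; the real module wraps
        # tool.run with asyncio.to_thread inside run_search_tool.
        await asyncio.sleep(0.1)
        return f"{source} results for {query!r}"

    async def main():
        # Same shape as search_sota: launch every source at once,
        # then join the non-empty results for one synthesis prompt.
        results = await asyncio.gather(
            fetch("Tavily", "transformer sota"),
            fetch("ArXiv", "transformer sota"),
            fetch("SemanticScholar", "transformer sota"),
        )
        print("\n\n".join(r for r in results if r))

    asyncio.run(main())
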
requirements.txt CHANGED
@@ -1,14 +1,12 @@
 # Web Framework
 gradio==5.9.1
 
-# LLM Libraries
+# LLM Libraries (OpenRouter uses OpenAI SDK)
 openai==1.59.5
-google-generativeai==0.8.3
 
 # LangChain and Tools
 langchain==0.3.13
 langchain-community==0.3.13
-langchain-google-genai==2.0.8
 langgraph==0.2.59
 langgraph-checkpoint-sqlite==2.0.5

test_api.py ADDED
@@ -0,0 +1,273 @@
+"""
+Local test script for the MetaSearch API
+Tests individual pipeline components with sample data
+"""
+
+import asyncio
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Sample test data
+SAMPLE_PAPER_TITLE = "Attention Is All You Need"
+SAMPLE_PAPER_ABSTRACT = """
+We propose a new simple network architecture, the Transformer, based solely on
+attention mechanisms, dispensing with recurrence and convolutions entirely.
+Experiments on two machine translation tasks show these models to be superior
+in quality while being more parallelizable and requiring significantly less time to train.
+"""
+
+SAMPLE_REVIEWS = [
+    """
+    This paper introduces a novel architecture that replaces recurrence with self-attention.
+
+    Strengths:
+    - The model achieves state-of-the-art results on translation benchmarks
+    - Training is significantly faster due to parallelization
+    - The attention visualization provides interpretability
+
+    Weaknesses:
+    - Limited evaluation on other NLP tasks beyond translation
+    - The computational complexity of self-attention scales quadratically with sequence length
+    - Missing comparison with some recent RNN variants
+
+    The methodology is sound but could benefit from more diverse experiments.
+    Overall, this is a strong contribution to the field.
+    """,
+    """
+    The Transformer architecture is an interesting departure from RNN-based models.
+
+    Strengths:
+    - Clean and elegant architecture design
+    - Strong empirical results on WMT benchmarks
+    - Good ablation studies
+
+    Weaknesses:
+    - The paper overclaims novelty - attention mechanisms existed before
+    - Experiments are limited to machine translation only
+    - No theoretical analysis of why this works better
+    - Memory requirements are high for long sequences
+
+    The significance of this work is questionable given the narrow evaluation scope.
+    """,
+    """
+    This is a well-written paper with clear presentation of a new architecture.
+
+    Strengths:
+    - Excellent results, setting new SOTA on translation
+    - The multi-head attention is a clever innovation
+    - Reproducibility details are provided
+
+    Weaknesses:
+    - Claims of "attention is all you need" are overstated
+    - Limited to sequence-to-sequence tasks
+    - Positional encoding seems like a hack
+
+    Overall a solid paper with important contributions despite some limitations.
+    """
+]
+
+
+async def test_critique_extraction():
+    """Test the critique extraction module"""
+    print("\n" + "="*60)
+    print("Testing Critique Extraction")
+    print("="*60)
+
+    from pipeline.critique_extraction import extract_critiques
+
+    print(f"Processing {len(SAMPLE_REVIEWS)} reviews...")
+    critiques = await extract_critiques(SAMPLE_REVIEWS)
+
+    for i, critique in enumerate(critiques):
+        print(f"\n--- Review {i+1} Critiques ---")
+        for category, points in critique.items():
+            if category != "error" and points:
+                print(f" {category}: {len(points)} points")
+                for point in points[:2]:  # Show first 2 points
+                    print(f" - {point[:80]}...")
+
+    return critiques
+
+
+async def test_disagreement_detection(critiques):
+    """Test the disagreement detection module"""
+    print("\n" + "="*60)
+    print("Testing Disagreement Detection")
+    print("="*60)
+
+    from pipeline.disagreement_detection import detect_disagreements
+
+    print(f"Detecting disagreements across {len(critiques)} reviews...")
+    disagreements = await detect_disagreements(critiques)
+
+    for d in disagreements:
+        pair = d.get('review_pair', [])
+        score = d.get('disagreement_score', 0)
+        print(f"\n--- Reviews {pair[0]+1} vs {pair[1]+1} ---")
+        print(f" Disagreement Score: {score:.2f}")
+
+        details = d.get('disagreement_details', {})
+        for category, points in details.items():
+            if points:
+                print(f" {category}: {len(points)} disagreements")
+
+    return disagreements
+
+
+async def test_search_retrieval(critiques):
+    """Test the search and retrieval module"""
+    print("\n" + "="*60)
+    print("Testing Search & Retrieval")
+    print("="*60)
+
+    from pipeline.search_retrieval import search_and_retrieve
+
+    print("Searching for SoTA research and evidence...")
+    results = await search_and_retrieve(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        critiques
+    )
+
+    print(f"\n--- SoTA Results (first 500 chars) ---")
+    print(results.get('SoTA_Results', 'N/A')[:500])
+
+    print(f"\n--- Combined Critiques ---")
+    for cat, text in results.get('Combined_Critiques', {}).items():
+        print(f" {cat}: {len(text)} chars")
+
+    print(f"\n--- Retrieved Evidence ---")
+    for cat, evidence in results.get('Retrieved_Evidence', {}).items():
+        print(f" {cat}: {len(evidence)} chars")
+
+    return results
+
+
+async def test_disagreement_resolution(critiques, disagreements, search_results):
+    """Test the disagreement resolution module"""
+    print("\n" + "="*60)
+    print("Testing Disagreement Resolution")
+    print("="*60)
+
+    from pipeline.disagreement_resolution import resolve_disagreements
+
+    print(f"Resolving {len(disagreements)} disagreements...")
+    resolutions = await resolve_disagreements(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        disagreements,
+        critiques,
+        search_results
+    )
+
+    for i, resolution in enumerate(resolutions):
+        print(f"\n--- Resolution {i+1} ---")
+        details = resolution.get('resolution_details', {})
+
+        accepted = details.get('accepted_critique_points', {})
+        rejected = details.get('rejected_critique_points', {})
+
+        print(f" Accepted categories: {list(accepted.keys())}")
+        print(f" Rejected categories: {list(rejected.keys())}")
+
+        summary = details.get('final_resolution_summary', '')
+        print(f" Summary: {summary[:200]}...")
+
+    return resolutions
+
+
+async def test_meta_review(resolutions, search_results):
+    """Test the meta-review generation module"""
+    print("\n" + "="*60)
+    print("Testing Meta-Review Generation")
+    print("="*60)
+
+    from pipeline.meta_review import generate_meta_review
+
+    print("Generating meta-review...")
+    meta_review = await generate_meta_review(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        resolutions,
+        search_results
+    )
+
+    print(f"\n--- Meta-Review (first 1000 chars) ---")
+    print(meta_review[:1000])
+    print("...")
+
+    return meta_review
+
+
+async def run_full_pipeline():
+    """Run the complete pipeline test"""
+    print("\n" + "#"*60)
+    print("# MetaSearch API - Full Pipeline Test")
+    print("#"*60)
+
+    # Check environment
+    if not os.getenv("OPENROUTER_API_KEY"):
+        print("\n❌ ERROR: OPENROUTER_API_KEY not set!")
+        print("Please set it in your .env file")
+        return
+
+    print("\n✅ OPENROUTER_API_KEY is set")
+
+    try:
+        # Step 1: Extract critiques
+        critiques = await test_critique_extraction()
+
+        # Step 2: Detect disagreements
+        disagreements = await test_disagreement_detection(critiques)
+
+        # Step 3: Search and retrieve (optional - can be slow)
+        search_results = await test_search_retrieval(critiques)
+
+        # Step 4: Resolve disagreements
+        resolutions = await test_disagreement_resolution(
+            critiques, disagreements, search_results
+        )
+
+        # Step 5: Generate meta-review
+        meta_review = await test_meta_review(resolutions, search_results)
+
+        print("\n" + "#"*60)
+        print("# ✅ Full Pipeline Test Complete!")
+        print("#"*60)
+
+    except Exception as e:
+        print(f"\n❌ Pipeline failed with error: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+async def run_quick_test():
+    """Run a quick test of just critique extraction"""
+    print("\n" + "#"*60)
+    print("# MetaSearch API - Quick Test (Critique Extraction Only)")
+    print("#"*60)
+
+    if not os.getenv("OPENROUTER_API_KEY"):
+        print("\n❌ ERROR: OPENROUTER_API_KEY not set!")
+        return
+
+    print("\n✅ OPENROUTER_API_KEY is set")
+
+    try:
+        critiques = await test_critique_extraction()
+        print("\n✅ Quick test passed!")
+    except Exception as e:
+        print(f"\n❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == "--quick":
+        asyncio.run(run_quick_test())
+    else:
+        asyncio.run(run_full_pipeline())
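
Editor's note: per the __main__ block, python test_api.py runs the five pipeline stages end to end against the bundled sample reviews, while python test_api.py --quick exercises critique extraction only; both require OPENROUTER_API_KEY in the environment or a local .env file.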