Tirath5504 committed
Commit 08a5a31 · Parent: 2bf686d

use openrouter only instead of google-genai

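A note on the migration pattern, as a minimal sketch (not part of the commit): every model, Gemini included, is now reached through OpenRouter's OpenAI-compatible endpoint using provider-prefixed model IDs, so a single SDK and a single OPENROUTER_API_KEY cover all LLM calls.

import os
from openai import OpenAI

# One client for every model; OpenRouter speaks the OpenAI wire protocol.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Provider-prefixed model ID, as used throughout this commit.
response = client.chat.completions.create(
    model="google/gemini-2.5-flash-lite",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)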
.gitignore ADDED
@@ -0,0 +1,3 @@
+venv/
+.env
+__pycache__/
config.py CHANGED
@@ -11,10 +11,10 @@ API_DESCRIPTION = """
 ## Automated Consensus Analysis for Peer Reviews
 
 This API provides comprehensive analysis of peer review disagreements using:
-- **LLM-based critique extraction** (Gemini 2.0)
+- **LLM-based critique extraction** (Gemini 2.5 Flash Lite via OpenRouter)
 - **Disagreement detection** between reviewers
 - **Search-augmented evidence retrieval** (Semantic Scholar, arXiv, Google Scholar, Tavily)
-- **AI-powered disagreement resolution** (DeepSeek-R1)
+- **AI-powered disagreement resolution** (DeepSeek-R1 via OpenRouter)
 - **Meta-review generation**
 
 ### Features:
@@ -29,12 +29,11 @@ MAX_REQUESTS_PER_MINUTE = int(os.getenv("MAX_REQUESTS_PER_MINUTE", "10"))
 MAX_CONCURRENT_TASKS = int(os.getenv("MAX_CONCURRENT_TASKS", "3"))
 QUEUE_MAX_SIZE = int(os.getenv("QUEUE_MAX_SIZE", "20"))
 
-# Model Configuration
-GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.0-flash")
+# Model Configuration (all via OpenRouter)
+GEMINI_MODEL = os.getenv("GEMINI_MODEL", "google/gemini-2.5-flash-lite")
 DEEPSEEK_MODEL = os.getenv("DEEPSEEK_MODEL", "deepseek/deepseek-r1")
 
 # API Keys (from HF Spaces secrets)
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")
@@ -58,7 +57,6 @@ def validate_environment():
         ValueError: If required variables are missing
     """
     required_vars = {
-        "GEMINI_API_KEY": GEMINI_API_KEY,
         "OPENROUTER_API_KEY": OPENROUTER_API_KEY,
         "TAVILY_API_KEY": TAVILY_API_KEY,
     }
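A hedged sketch of the fail-fast check that validate_environment now performs: after this commit only the OpenRouter and Tavily keys are required. The function body sits outside the diff, so the exact error message below is an assumption.

import os

def validate_environment():
    # Only these two keys remain mandatory after the migration.
    required_vars = {
        "OPENROUTER_API_KEY": os.getenv("OPENROUTER_API_KEY"),
        "TAVILY_API_KEY": os.getenv("TAVILY_API_KEY"),
    }
    missing = [name for name, value in required_vars.items() if not value]
    if missing:
        # Assumed wording; the diff only documents that a ValueError is raised.
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")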
pipeline/critique_extraction.py CHANGED
@@ -1,16 +1,21 @@
 import json
 import os
 from typing import List, Dict
-import google.generativeai as genai
+from openai import OpenAI
 from pydantic import BaseModel
 import asyncio
-import time
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# Configure Gemini
-genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# Initialize OpenRouter client
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+
+# Model to use for critique extraction
+CRITIQUE_MODEL = "google/gemini-2.5-flash-lite"
 
 class CritiquePoint(BaseModel):
     Methodology: List[str] = []
@@ -21,7 +26,7 @@ class CritiquePoint(BaseModel):
 
 async def extract_single_critique(review_text: str, retries: int = 5) -> Dict:
     """
-    Extract critique points from a single review using Gemini
+    Extract critique points from a single review using OpenRouter (Gemini)
 
     Args:
         review_text: The review text to analyze
@@ -30,59 +35,67 @@ async def extract_single_critique(review_text: str, retries: int = 5) -> Dict:
     Returns:
         Dictionary with categorized critique points
     """
-    prompt = f"""
-    Extract key critique points from the following research paper review.
-    Categorize them into aspects: Methodology, Experiments, Clarity, Significance, Novelty.
-    Return a structured JSON with these categories as keys and lists of critique points as values.
-
-    Review:
-    {review_text}
+    system_prompt = """
+    You are an expert at analyzing academic peer reviews.
+    Extract key critique points from the review and categorize them.
 
     Respond with ONLY valid JSON in this format:
-    {{
+    {
         "Methodology": ["point1", "point2"],
         "Experiments": ["point1"],
         "Clarity": ["point1", "point2"],
         "Significance": ["point1"],
         "Novelty": ["point1"]
-    }}
+    }
+    """
+
+    user_prompt = f"""
+    Extract key critique points from the following research paper review.
+    Categorize them into aspects: Methodology, Experiments, Clarity, Significance, Novelty.
+
+    Review:
+    {review_text}
     """
 
-    model = genai.GenerativeModel(
-        model_name="gemini-2.5-flash-lite",
-        generation_config={
-            "response_mime_type": "application/json",
-        }
-    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
             response = await asyncio.to_thread(
-                model.generate_content,
-                prompt
+                client.chat.completions.create,
+                model=CRITIQUE_MODEL,
+                messages=messages,
+                max_tokens=2048,
+                response_format={"type": "json_object"},
             )
 
-            if not response.text.strip():
-                raise ValueError("Empty response from Gemini")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            result = json.loads(response.text)
+            result = json.loads(response.choices[0].message.content.strip())
 
             # Validate structure
             critique = CritiquePoint(**result)
             return critique.model_dump()
 
-        except genai.types.generation_types.BlockedPromptException as e:
-            print(f"Content blocked by safety filters: {e}")
-            return {
-                "Methodology": [],
-                "Experiments": [],
-                "Clarity": [],
-                "Significance": [],
-                "Novelty": [],
-                "error": "Content blocked by safety filters"
-            }
-
         except Exception as e:
+            error_msg = str(e)
+
+            # Check for content safety blocks
+            if "safety" in error_msg.lower() or "blocked" in error_msg.lower():
+                print(f"Content blocked by safety filters: {e}")
+                return {
+                    "Methodology": [],
+                    "Experiments": [],
+                    "Clarity": [],
+                    "Significance": [],
+                    "Novelty": [],
+                    "error": "Content blocked by safety filters"
+                }
+
             wait_time = 2 ** attempt
             print(f"Attempt {attempt + 1} failed: {e}. Retrying in {wait_time}s...")
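Worth noting why the new code wraps the synchronous OpenAI client in asyncio.to_thread: chat.completions.create blocks, and to_thread moves the call off the event loop so several reviews can be extracted concurrently. A condensed, self-contained sketch of the request/validate/backoff loop, with names taken from the diff and the prompts abbreviated:

import asyncio
import json
import os
from typing import Dict, List

from openai import OpenAI
from pydantic import BaseModel

client = OpenAI(base_url="https://openrouter.ai/api/v1",
                api_key=os.getenv("OPENROUTER_API_KEY"))

class CritiquePoint(BaseModel):
    Methodology: List[str] = []
    Experiments: List[str] = []
    Clarity: List[str] = []
    Significance: List[str] = []
    Novelty: List[str] = []

async def extract(review_text: str, retries: int = 5) -> Dict:
    messages = [
        {"role": "system", "content": "Categorize critiques; reply with JSON only."},
        {"role": "user", "content": review_text},
    ]
    for attempt in range(retries):
        try:
            # to_thread keeps the blocking SDK call off the event loop;
            # response_format nudges the model into strict JSON output.
            resp = await asyncio.to_thread(
                client.chat.completions.create,
                model="google/gemini-2.5-flash-lite",
                messages=messages,
                response_format={"type": "json_object"},
            )
            data = json.loads(resp.choices[0].message.content)
            return CritiquePoint(**data).model_dump()  # validate the shape
        except Exception:
            await asyncio.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s...
    return CritiquePoint().model_dump()  # empty result after exhausting retries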
pipeline/disagreement_detection.py CHANGED
@@ -2,15 +2,21 @@ import json
 import os
 from typing import List, Dict
 from itertools import combinations
-import google.generativeai as genai
+from openai import OpenAI
 from pydantic import BaseModel, Field
 import asyncio
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# Configure Gemini
-genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# Initialize OpenRouter client
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+
+# Model to use for disagreement detection
+DISAGREEMENT_MODEL = "google/gemini-2.5-flash-lite"
 
 class DisagreementDetails(BaseModel):
     Methodology: List[str] = Field(default_factory=list)
@@ -48,7 +54,24 @@ async def compare_review_pair(
     Returns:
         Disagreement analysis results
     """
-    prompt = f"""
+    system_prompt = """
+    You are an expert at analyzing academic peer review disagreements.
+    Compare reviews and identify disagreements across different aspects.
+
+    Respond with ONLY valid JSON in this exact format:
+    {
+        "disagreement_score": 0.5,
+        "disagreement_details": {
+            "Methodology": ["specific disagreement point 1"],
+            "Experiments": ["specific disagreement point 1"],
+            "Clarity": [],
+            "Significance": ["specific disagreement point 1"],
+            "Novelty": []
+        }
+    }
+    """
+
+    user_prompt = f"""
     Compare the following two reviews and identify disagreements across different aspects.
     Assess disagreement level (0.0 = perfect agreement, 1.0 = complete disagreement) and
     list specific points of disagreement for each category.
@@ -66,38 +89,27 @@ async def compare_review_pair(
     Clarity: {list_to_string(review2.get('Clarity', []))}
     Significance: {list_to_string(review2.get('Significance', []))}
     Novelty: {list_to_string(review2.get('Novelty', []))}
-
-    Respond with ONLY valid JSON in this exact format:
-    {{
-        "disagreement_score": 0.5,
-        "disagreement_details": {{
-            "Methodology": ["specific disagreement point 1"],
-            "Experiments": ["specific disagreement point 1"],
-            "Clarity": [],
-            "Significance": ["specific disagreement point 1"],
-            "Novelty": []
-        }}
-    }}
     """
 
-    model = genai.GenerativeModel(
-        model_name="gemini-2.5-flash-lite",
-        generation_config={
-            "response_mime_type": "application/json",
-        }
-    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
             response = await asyncio.to_thread(
-                model.generate_content,
-                prompt
+                client.chat.completions.create,
+                model=DISAGREEMENT_MODEL,
+                messages=messages,
+                max_tokens=2048,
+                response_format={"type": "json_object"},
             )
 
-            if not response.text.strip():
-                raise ValueError("Empty response from Gemini")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            result = json.loads(response.text)
+            result = json.loads(response.choices[0].message.content.strip())
 
             # Validate structure
             disagreement = DisagreementResult(
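This module imports itertools.combinations, which suggests detect_disagreements fans compare_review_pair out over every unordered pair of reviews. That function's body is outside the diff, so the sketch below is an assumption, with a stub standing in for the real LLM-backed comparison:

import asyncio
from itertools import combinations
from typing import Dict, List

async def compare_review_pair(review1: Dict, review2: Dict) -> Dict:
    # Stub in place of the OpenRouter call shown in the diff above.
    return {"disagreement_score": 0.0, "disagreement_details": {}}

async def detect_disagreements(critiques: List[Dict]) -> List[Dict]:
    # Compare every unordered pair of reviews concurrently.
    pairs = list(combinations(range(len(critiques)), 2))
    results = await asyncio.gather(
        *(compare_review_pair(critiques[i], critiques[j]) for i, j in pairs)
    )
    # Attach the pair indices that test_api.py later reads as "review_pair".
    return [{"review_pair": [i, j], **res} for (i, j), res in zip(pairs, results)]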
pipeline/disagreement_resolution.py CHANGED
@@ -18,11 +18,11 @@ client = OpenAI(
 # Priority list of models to try
 # 1. DeepSeek R1 (Best reasoning, most expensive)
 # 2. DeepSeek R1 Distill (Good reasoning, cheaper)
-# 3. Gemini 2.0 Flash (Free/Cheap, very fast fallback)
+# 3. Gemini 2.5 Flash Lite (Cheap, fast fallback)
 MODELS = [
     "deepseek/deepseek-r1",
     "deepseek/deepseek-r1-distill-llama-70b",
-    "google/gemini-2.0-flash-exp:free"
+    "google/gemini-2.5-flash-lite"
 ]
 
 class ResolutionDetails(BaseModel):
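A hedged sketch of the fallback pattern the MODELS priority list implies: try DeepSeek R1 first, then the distilled variant, then Gemini 2.5 Flash Lite, returning the first successful completion. Only the list itself appears in the diff; the loop body is an assumption.

import os
from openai import OpenAI

client = OpenAI(base_url="https://openrouter.ai/api/v1",
                api_key=os.getenv("OPENROUTER_API_KEY"))

MODELS = [
    "deepseek/deepseek-r1",
    "deepseek/deepseek-r1-distill-llama-70b",
    "google/gemini-2.5-flash-lite",
]

def complete_with_fallback(messages: list) -> str:
    last_error = None
    for model in MODELS:
        try:
            resp = client.chat.completions.create(model=model, messages=messages)
            return resp.choices[0].message.content
        except Exception as e:
            last_error = e  # fall through to the next, cheaper model
    raise RuntimeError(f"All models failed; last error: {last_error}")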
pipeline/meta_review.py CHANGED
@@ -18,7 +18,7 @@ client = OpenAI(
 MODELS = [
     "deepseek/deepseek-r1",
     "deepseek/deepseek-r1-distill-llama-70b",
-    "google/gemini-2.0-flash-exp:free"
+    "google/gemini-2.5-flash-lite"
 ]
 
 class MetaReviewResult(BaseModel):
pipeline/search_retrieval.py CHANGED
@@ -1,63 +1,29 @@
 import os
 from typing import Dict, List
 import asyncio
-from langchain_google_genai import ChatGoogleGenerativeAI
+from openai import OpenAI
 from langchain_community.utilities import ArxivAPIWrapper, SerpAPIWrapper
 from langchain_community.tools.semanticscholar.tool import SemanticScholarQueryRun
 from langchain_community.tools.tavily_search import TavilySearchResults
-from langchain.agents import AgentType, initialize_agent, AgentExecutor
-from langchain.tools import Tool
 
 from dotenv import load_dotenv
 load_dotenv()
 
-# Initialize LLM
-llm = ChatGoogleGenerativeAI(
-    model=os.getenv("GEMINI_MODEL"),
-    google_api_key=os.getenv("GEMINI_API_KEY"),
-    max_retries=2,
+# Initialize OpenRouter client for LLM calls
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
 )
 
+# Model for search/retrieval tasks
+SEARCH_MODEL = "google/gemini-2.5-flash-lite"
+
 # Initialize search tools
 semantic_scholar = SemanticScholarQueryRun()
 google_scholar = SerpAPIWrapper(params={"engine": "google_scholar"})
 arxiv_search = ArxivAPIWrapper()
 tavily_search = TavilySearchResults(max_results=5)
 
-# Define tools
-tools = [
-    Tool(
-        name="TavilySearch",
-        func=tavily_search.run,
-        description="Retrieves the latest State-of-the-Art (SoTA) research and current academic information"
-    ),
-    Tool(
-        name="SemanticScholar",
-        func=semantic_scholar.run,
-        description="Find academic papers from Semantic Scholar database"
-    ),
-    Tool(
-        name="GoogleScholar",
-        func=google_scholar.run,
-        description="Search for scholarly articles and citations"
-    ),
-    Tool(
-        name="ArxivSearch",
-        func=arxiv_search.run,
-        description="Find research papers from ArXiv preprint repository"
-    ),
-]
-
-# Initialize agent
-agent = initialize_agent(
-    tools=tools,
-    llm=llm,
-    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-    verbose=False,
-    handle_parsing_errors=True,
-    max_iterations=10
-)
-
 def combine_critiques(critique_points: List[Dict]) -> Dict[str, str]:
     """
     Combine critique points from multiple reviews into categories
@@ -82,6 +48,16 @@ def combine_critiques(critique_points: List[Dict]) -> Dict[str, str]:
 
     return combined
 
+async def run_search_tool(tool_name: str, tool_func, query: str) -> str:
+    """Run a search tool with error handling"""
+    try:
+        result = await asyncio.to_thread(tool_func, query)
+        return str(result) if result else ""
+    except Exception as e:
+        print(f"{tool_name} search failed: {e}")
+        return ""
+
+
 async def search_sota(paper_title: str, paper_abstract: str, retries: int = 3) -> str:
     """
     Search for state-of-the-art research related to the paper
@@ -94,29 +70,73 @@ async def search_sota(paper_title: str, paper_abstract: str, retries: int = 3) -> str:
     Returns:
         Summary of SoTA findings
     """
-    query = (
-        f"Find the latest state-of-the-art research related to: '{paper_title}'. "
-        f"Abstract: {paper_abstract[:500]}. "
-        f"Focus on recent advances, similar methodologies, and competing approaches."
-    )
+    # Create search query
+    search_query = f"{paper_title} recent advances methodology"
+
+    # Run multiple searches in parallel
+    search_tasks = [
+        run_search_tool("Tavily", tavily_search.run, search_query),
+        run_search_tool("ArXiv", arxiv_search.run, search_query[:300]),
+        run_search_tool("SemanticScholar", semantic_scholar.run, paper_title),
+    ]
+
+    search_results = await asyncio.gather(*search_tasks)
+
+    # Combine all search results
+    combined_results = "\n\n".join([
+        f"=== Tavily Results ===\n{search_results[0]}" if search_results[0] else "",
+        f"=== ArXiv Results ===\n{search_results[1]}" if search_results[1] else "",
+        f"=== Semantic Scholar Results ===\n{search_results[2]}" if search_results[2] else "",
+    ])
+
+    if not combined_results.strip():
+        return "No SoTA research found from available sources."
+
+    # Use LLM to synthesize the results
+    system_prompt = """
+    You are an expert at synthesizing academic research findings.
+    Summarize the search results to identify state-of-the-art approaches and recent advances.
+    Focus on methodologies, key findings, and how they relate to the paper being reviewed.
+    """
+
+    user_prompt = f"""
+    Paper Title: {paper_title}
+    Paper Abstract: {paper_abstract[:500]}
+
+    Search Results:
+    {combined_results[:4000]}
+
+    Provide a concise summary of the state-of-the-art research relevant to this paper.
+    """
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
 
     for attempt in range(retries):
         try:
-            result = await asyncio.to_thread(agent.run, query)
+            response = await asyncio.to_thread(
+                client.chat.completions.create,
+                model=SEARCH_MODEL,
+                messages=messages,
+                max_tokens=2048,
+            )
 
-            if not result or len(result.strip()) < 50:
-                raise ValueError("Empty or insufficient response")
+            if not response.choices or not response.choices[0].message.content.strip():
+                raise ValueError("Empty response from API")
 
-            return result
+            return response.choices[0].message.content.strip()
 
         except Exception as e:
             wait_time = 2 ** attempt
-            print(f"SoTA search attempt {attempt + 1} failed: {e}")
+            print(f"SoTA synthesis attempt {attempt + 1} failed: {e}")
 
             if attempt < retries - 1:
                 await asyncio.sleep(wait_time)
             else:
-                return f"Error retrieving SoTA research: {str(e)}"
+                # Return raw results if synthesis fails
+                return f"Raw search results (synthesis failed):\n{combined_results[:2000]}"
 
@@ -137,28 +157,60 @@ async def retrieve_evidence_for_category(
     if critiques == "No critiques" or not critiques.strip():
         return f"No critiques to validate for {category}"
 
-    query = (
-        f"Find research papers that support or contradict these critiques "
-        f"related to {category}: {critiques[:500]}"
-    )
+    # Create targeted search query
+    search_query = f"{category} research validation {critiques[:200]}"
 
-    for attempt in range(retries):
-        try:
-            result = await asyncio.to_thread(agent.run, query)
-
-            if not result:
-                raise ValueError("Empty response")
-
-            return result
-
-        except Exception as e:
-            wait_time = 2 ** attempt
-            print(f"Evidence retrieval for {category} attempt {attempt + 1} failed: {e}")
-
-            if attempt < retries - 1:
-                await asyncio.sleep(wait_time)
-            else:
-                return f"Error retrieving evidence for {category}: {str(e)}"
+    # Run search
+    try:
+        tavily_result = await run_search_tool("Tavily", tavily_search.run, search_query)
+        arxiv_result = await run_search_tool("ArXiv", arxiv_search.run, search_query[:200])
+
+        combined = f"{tavily_result}\n{arxiv_result}".strip()
+
+        if not combined:
+            return f"No evidence found for {category} critiques"
+
+        # Use LLM to analyze relevance
+        system_prompt = f"""
+        You are an expert at evaluating academic critiques.
+        Analyze the search results to find evidence that supports or contradicts the critiques.
+        Focus on the {category} aspect.
+        """
+
+        user_prompt = f"""
+        Critiques for {category}: {critiques}
+
+        Search Results:
+        {combined[:2000]}
+
+        Summarize the evidence found that relates to these critiques.
+        """
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+
+        for attempt in range(retries):
+            try:
+                response = await asyncio.to_thread(
+                    client.chat.completions.create,
+                    model=SEARCH_MODEL,
+                    messages=messages,
+                    max_tokens=1024,
+                )
+
+                if response.choices and response.choices[0].message.content.strip():
+                    return response.choices[0].message.content.strip()
+
+            except Exception as e:
+                if attempt < retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+
+        return f"Evidence retrieval completed for {category}"
+
+    except Exception as e:
+        return f"Error retrieving evidence for {category}: {str(e)}"
 
 async def retrieve_evidence(combined_critiques: Dict[str, str]) -> Dict[str, str]:
     """
requirements.txt CHANGED
@@ -1,14 +1,12 @@
 # Web Framework
 gradio==5.9.1
 
-# LLM Libraries
+# LLM Libraries (OpenRouter uses OpenAI SDK)
 openai==1.59.5
-google-generativeai==0.8.3
 
 # LangChain and Tools
 langchain==0.3.13
 langchain-community==0.3.13
-langchain-google-genai==2.0.8
 langgraph==0.2.59
 langgraph-checkpoint-sqlite==2.0.5
 
test_api.py ADDED
@@ -0,0 +1,273 @@
+"""
+Local test script for the MetaSearch API
+Tests individual pipeline components with sample data
+"""
+
+import asyncio
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Sample test data
+SAMPLE_PAPER_TITLE = "Attention Is All You Need"
+SAMPLE_PAPER_ABSTRACT = """
+We propose a new simple network architecture, the Transformer, based solely on
+attention mechanisms, dispensing with recurrence and convolutions entirely.
+Experiments on two machine translation tasks show these models to be superior
+in quality while being more parallelizable and requiring significantly less time to train.
+"""
+
+SAMPLE_REVIEWS = [
+    """
+    This paper introduces a novel architecture that replaces recurrence with self-attention.
+
+    Strengths:
+    - The model achieves state-of-the-art results on translation benchmarks
+    - Training is significantly faster due to parallelization
+    - The attention visualization provides interpretability
+
+    Weaknesses:
+    - Limited evaluation on other NLP tasks beyond translation
+    - The computational complexity of self-attention scales quadratically with sequence length
+    - Missing comparison with some recent RNN variants
+
+    The methodology is sound but could benefit from more diverse experiments.
+    Overall, this is a strong contribution to the field.
+    """,
+    """
+    The Transformer architecture is an interesting departure from RNN-based models.
+
+    Strengths:
+    - Clean and elegant architecture design
+    - Strong empirical results on WMT benchmarks
+    - Good ablation studies
+
+    Weaknesses:
+    - The paper overclaims novelty - attention mechanisms existed before
+    - Experiments are limited to machine translation only
+    - No theoretical analysis of why this works better
+    - Memory requirements are high for long sequences
+
+    The significance of this work is questionable given the narrow evaluation scope.
+    """,
+    """
+    This is a well-written paper with clear presentation of a new architecture.
+
+    Strengths:
+    - Excellent results, setting new SOTA on translation
+    - The multi-head attention is a clever innovation
+    - Reproducibility details are provided
+
+    Weaknesses:
+    - Claims of "attention is all you need" are overstated
+    - Limited to sequence-to-sequence tasks
+    - Positional encoding seems like a hack
+
+    Overall a solid paper with important contributions despite some limitations.
+    """
+]
+
+
+async def test_critique_extraction():
+    """Test the critique extraction module"""
+    print("\n" + "="*60)
+    print("Testing Critique Extraction")
+    print("="*60)
+
+    from pipeline.critique_extraction import extract_critiques
+
+    print(f"Processing {len(SAMPLE_REVIEWS)} reviews...")
+    critiques = await extract_critiques(SAMPLE_REVIEWS)
+
+    for i, critique in enumerate(critiques):
+        print(f"\n--- Review {i+1} Critiques ---")
+        for category, points in critique.items():
+            if category != "error" and points:
+                print(f"  {category}: {len(points)} points")
+                for point in points[:2]:  # Show first 2 points
+                    print(f"    - {point[:80]}...")
+
+    return critiques
+
+
+async def test_disagreement_detection(critiques):
+    """Test the disagreement detection module"""
+    print("\n" + "="*60)
+    print("Testing Disagreement Detection")
+    print("="*60)
+
+    from pipeline.disagreement_detection import detect_disagreements
+
+    print(f"Detecting disagreements across {len(critiques)} reviews...")
+    disagreements = await detect_disagreements(critiques)
+
+    for d in disagreements:
+        pair = d.get('review_pair', [])
+        score = d.get('disagreement_score', 0)
+        print(f"\n--- Reviews {pair[0]+1} vs {pair[1]+1} ---")
+        print(f"  Disagreement Score: {score:.2f}")
+
+        details = d.get('disagreement_details', {})
+        for category, points in details.items():
+            if points:
+                print(f"  {category}: {len(points)} disagreements")
+
+    return disagreements
+
+
+async def test_search_retrieval(critiques):
+    """Test the search and retrieval module"""
+    print("\n" + "="*60)
+    print("Testing Search & Retrieval")
+    print("="*60)
+
+    from pipeline.search_retrieval import search_and_retrieve
+
+    print("Searching for SoTA research and evidence...")
+    results = await search_and_retrieve(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        critiques
+    )
+
+    print(f"\n--- SoTA Results (first 500 chars) ---")
+    print(results.get('SoTA_Results', 'N/A')[:500])
+
+    print(f"\n--- Combined Critiques ---")
+    for cat, text in results.get('Combined_Critiques', {}).items():
+        print(f"  {cat}: {len(text)} chars")
+
+    print(f"\n--- Retrieved Evidence ---")
+    for cat, evidence in results.get('Retrieved_Evidence', {}).items():
+        print(f"  {cat}: {len(evidence)} chars")
+
+    return results
+
+
+async def test_disagreement_resolution(critiques, disagreements, search_results):
+    """Test the disagreement resolution module"""
+    print("\n" + "="*60)
+    print("Testing Disagreement Resolution")
+    print("="*60)
+
+    from pipeline.disagreement_resolution import resolve_disagreements
+
+    print(f"Resolving {len(disagreements)} disagreements...")
+    resolutions = await resolve_disagreements(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        disagreements,
+        critiques,
+        search_results
+    )
+
+    for i, resolution in enumerate(resolutions):
+        print(f"\n--- Resolution {i+1} ---")
+        details = resolution.get('resolution_details', {})
+
+        accepted = details.get('accepted_critique_points', {})
+        rejected = details.get('rejected_critique_points', {})
+
+        print(f"  Accepted categories: {list(accepted.keys())}")
+        print(f"  Rejected categories: {list(rejected.keys())}")
+
+        summary = details.get('final_resolution_summary', '')
+        print(f"  Summary: {summary[:200]}...")
+
+    return resolutions
+
+
+async def test_meta_review(resolutions, search_results):
+    """Test the meta-review generation module"""
+    print("\n" + "="*60)
+    print("Testing Meta-Review Generation")
+    print("="*60)
+
+    from pipeline.meta_review import generate_meta_review
+
+    print("Generating meta-review...")
+    meta_review = await generate_meta_review(
+        SAMPLE_PAPER_TITLE,
+        SAMPLE_PAPER_ABSTRACT,
+        resolutions,
+        search_results
+    )
+
+    print(f"\n--- Meta-Review (first 1000 chars) ---")
+    print(meta_review[:1000])
+    print("...")
+
+    return meta_review
+
+
+async def run_full_pipeline():
+    """Run the complete pipeline test"""
+    print("\n" + "#"*60)
+    print("# MetaSearch API - Full Pipeline Test")
+    print("#"*60)
+
+    # Check environment
+    if not os.getenv("OPENROUTER_API_KEY"):
+        print("\n❌ ERROR: OPENROUTER_API_KEY not set!")
+        print("Please set it in your .env file")
+        return
+
+    print("\n✅ OPENROUTER_API_KEY is set")
+
+    try:
+        # Step 1: Extract critiques
+        critiques = await test_critique_extraction()
+
+        # Step 2: Detect disagreements
+        disagreements = await test_disagreement_detection(critiques)
+
+        # Step 3: Search and retrieve (optional - can be slow)
+        search_results = await test_search_retrieval(critiques)
+
+        # Step 4: Resolve disagreements
+        resolutions = await test_disagreement_resolution(
+            critiques, disagreements, search_results
+        )
+
+        # Step 5: Generate meta-review
+        meta_review = await test_meta_review(resolutions, search_results)
+
+        print("\n" + "#"*60)
+        print("# ✅ Full Pipeline Test Complete!")
+        print("#"*60)
+
+    except Exception as e:
+        print(f"\n❌ Pipeline failed with error: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+async def run_quick_test():
+    """Run a quick test of just critique extraction"""
+    print("\n" + "#"*60)
+    print("# MetaSearch API - Quick Test (Critique Extraction Only)")
+    print("#"*60)
+
+    if not os.getenv("OPENROUTER_API_KEY"):
+        print("\n❌ ERROR: OPENROUTER_API_KEY not set!")
+        return
+
+    print("\n✅ OPENROUTER_API_KEY is set")
+
+    try:
+        critiques = await test_critique_extraction()
+        print("\n✅ Quick test passed!")
+    except Exception as e:
+        print(f"\n❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == "--quick":
+        asyncio.run(run_quick_test())
+    else:
+        asyncio.run(run_full_pipeline())
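Usage note: as the __main__ block shows, `python test_api.py` exercises the full five-step pipeline against the sample Transformer reviews, while `python test_api.py --quick` runs critique extraction only. Both modes require OPENROUTER_API_KEY in the environment (or a local .env file).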