Mr-Thop Nand0ZZ committed on
Commit
9f21c0c
·
verified ·
1 Parent(s): cc703a4

Update rm.py (#3)

Browse files

- Update rm.py (47d17a42db934d2b6381a9d49347dfd4de4b8f85)


Co-authored-by: Nandini Patawri <Nand0ZZ@users.noreply.huggingface.co>

Files changed (1) hide show
  1. rm.py +148 -115
rm.py CHANGED
@@ -8,185 +8,218 @@ from google import genai
8
  import json
9
  import logging
10
 
11
-
12
-
13
-
14
  f_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
15
  app = Flask(__name__)
16
  CORS(app)
17
 
18
  client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT")
22
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
23
-
24
- def get_google_scholar_results(key_params: dict):
25
- key_params['api_key'] = os.getenv("SERPAPI_API_KEY")
26
- key_params['engine'] = "google_scholar"
27
- key_params['hl'] = "en"
28
  search = GoogleSearch(key_params)
29
  results = search.get_dict()
30
- if "profiles" in results and "organic_results" in results:
31
- return results["profiles"],results["organic_results"]
32
- elif "profiles" in results:
33
- return results["profiles"],None
34
- elif "organic_results" in results:
35
- return None,results["organic_results"]
36
- else:
37
- return None,None
38
 
39
- def get_results(query: str):
40
- '''
41
- This function is used to get the results from the Google Scholar API.
42
- It takes a query as input and returns a list of dictionaries, each containing the information about a paper/author.
43
- The keys of the dictionaries are the fields of the paper.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- Keys of the dictionary are:
46
- dict_keys(['position', 'title', 'result_id', 'link', 'snippet', 'publication_info', 'resources', 'inline_links'])
47
- '''
48
- params = {
49
- "q": query,
50
- }
51
 
52
  answer = []
53
- keys = []
 
54
 
55
- profiles,result = get_google_scholar_results(params)
56
- if result:
57
- keys = result[0].keys()
58
- for i in range(len(result)):
59
  output = {}
60
- if "title" in result[i]:
61
- output["title"] = result[i]["title"]
62
- if "result_id" in result[i]:
63
- output["result_id"] = result[i]["result_id"]
64
- if "link" in result[i]:
65
- output["link"] = result[i]["link"]
66
- if "https://www.annualreviews" in result[i]["link"]:
67
- output["abstract"] = get_abstract(result[i]["link"])
68
- if "snippet" in result[i]:
69
- output["snippet"] = result[i]["snippet"]
70
- if "publication_info" in result[i]:
71
- output["publication_info"] = result[i]["publication_info"]
72
- if "resources" in result[i]:
73
- output["resources"] = result[i]["resources"]
74
-
75
  answer.append(output)
76
-
77
 
78
- return profiles,answer,keys
79
 
 
80
  def get_abstract(url: str):
81
- scrape_result = f_app.scrape(url, formats=['markdown', 'html'])
82
  if "Abstract" in scrape_result.html:
83
  offset = scrape_result.html.find("Abstract")
84
  start = scrape_result.html[offset:].find("<p>")
85
- end = scrape_result.html[offset+start:].find("</p>")
86
- return scrape_result.html[offset+start:offset+start+end]
87
- else:
88
- return "Abstract not found"
89
-
90
- def scrape_web(url:str):
91
- '''
92
- This function is used inorder to scrape any websitye based on its url
93
- Returns the html code of the webpage
94
- '''
95
- scrape_result = f_app.scrape(url, formats=['markdown', 'html'])
96
  return scrape_result.html
97
 
98
- def get_response(chat_client,user):
 
 
 
99
  response = chat_client.send_message(user)
100
  return response.candidates[0].content.parts[0].text
101
 
102
  def convert_to_json(text):
103
  start = text.find("{")
104
  end = text[::-1].find("}")
105
- json_text = text[start : -end]
106
  try:
107
  return json.loads(json_text)
108
  except Exception as e:
109
  return "Json Parse Error due to " + str(e)
110
 
111
- def get_observation(function,inp):
112
- functions = ["get_results","scrape_web"]
113
  if function == functions[0]:
114
- profiles,answer,keys = get_results(inp)
 
 
 
 
 
115
  out_dict = {
116
- "state" : "OBSERVATION",
117
- "observation" : {
118
- "profiles" : profiles,
119
- "answer" : answer,
120
- "keys" : keys
121
  }
122
  }
123
  elif function == functions[1]:
124
  html_text = scrape_web(inp)
125
  out_dict = {
126
- "state" : "OBSERVATION",
127
- "observation" : {
128
- "html_text" : html_text
129
- }
130
  }
131
  else:
132
  out_dict = {
133
- "state" : "OBSERVATION",
134
- "observation" : {
135
- "message":"Function Not found, Please Retry"
136
- }
137
  }
138
  return out_dict
139
 
140
- def get_output(chat_client,inp):
141
- response = get_response(chat_client,str(inp))
142
  output = convert_to_json(response)
143
- while output["state"] != "OUTPUT":
144
- if output["state"] == "PLAN":
145
- response = get_response(chat_client,str(output))
146
  output = convert_to_json(response)
147
- elif output["state"] == "CALL":
148
- function = output["function_name"]
149
- for i in output["params"].keys():
150
- inp = output["params"][i]
151
- obs = get_observation(function,inp)
152
- response = get_response(chat_client,str(obs))
 
 
 
153
  output = convert_to_json(response)
154
- elif output["state"] == "OBSERVATION":
155
- response = get_response(chat_client,str(output))
156
  output = convert_to_json(response)
157
  else:
158
- response = get_response(chat_client,str(output))
159
  output = convert_to_json(response)
160
  return output
161
-
162
-
163
-
164
 
165
  def chat(query: str):
166
- chat_client = client.chats.create(
167
- model="gemini-2.5-flash"
168
- )
169
- response = get_response(chat_client,SYSTEM_PROMPT)
170
- inp = {
171
- "state" : "START",
172
- "user" : query
173
- }
174
-
175
- output = get_output(chat_client,inp)
176
  return output["output"]
177
 
178
-
179
- @app.route("/",methods=["GET"])
180
  def default():
181
  return jsonify({"message": "Backend Working Successfully"})
182
 
183
- @app.route("/chat",methods=["POST","GET"])
184
  def get_chat_results():
185
- query = request.json.get("query")
186
- app.logger.info(f"Chat Initiated : {query}")
 
 
 
 
 
 
 
187
  output = chat(query)
188
- app.logger.info("Output Parsed")
189
- return jsonify({"output":output})
190
 
191
 
192
 
 
8
  import json
9
  import logging
10
 
11
# -------- App & clients --------
# Firecrawl client used by get_abstract() and scrape_web(); the key is read from the environment.
f_app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
# Flask application; CORS(app) is applied to it right after creation.
app = Flask(__name__)
CORS(app)

# Gemini client; chat() creates one chat session from it per request.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Safe fallback so we never pass None into send_message
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are a helpful research assistant. Respond using a JSON state machine with states PLAN, CALL, OBSERVATION, OUTPUT."
)

# Root logging is configured at INFO level; `log` is the named logger used by this module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
log = logging.getLogger("rm.py")
26
+
27
+ # -------- Scholar search (location-aware) --------
28
def get_google_scholar_results(key_params: dict, location: str | None = None):
    """Query Google Scholar through SerpAPI and return ``(profiles, organic_results)``.

    Args:
        key_params: Search parameters (at least ``"q"``). The dict is copied,
            so the caller's object is never mutated and never ends up holding
            the API key.
        location: Optional free-text location. When given (and not blank),
            author profiles are kept only if the location appears, case
            insensitively, in one of their text fields.

    Returns:
        A ``(profiles, organic)`` tuple. Either element is ``None`` when the
        corresponding key is missing from the SerpAPI response.
    """
    params = dict(key_params)
    params["api_key"] = os.getenv("SERPAPI_API_KEY")
    params["engine"] = "google_scholar"
    params["hl"] = "en"

    results = GoogleSearch(params).get_dict()
    profiles = results.get("profiles")
    organic = results.get("organic_results")

    # A blank location matches everything, so filtering is skipped in that case.
    needle = location.strip().lower() if location else ""
    if profiles and needle:
        profiles = _filter_profiles_by_location(profiles, needle)

    return profiles, organic


# Profile fields that are searched for the location text.
_PROFILE_TEXT_FIELDS = ("name", "affiliations", "description", "position", "link", "email")


def _profile_matches(profile, needle: str) -> bool:
    """Return True when `needle` occurs in any searched text field of `profile`."""
    if not isinstance(profile, dict):
        return False
    haystack = " | ".join(str(profile.get(field, "")) for field in _PROFILE_TEXT_FIELDS)
    return needle in haystack.lower()


def _filter_profiles_by_location(profiles, needle: str):
    """Filter profiles by location while preserving the container shape.

    NOTE(review): SerpAPI appears to document ``profiles`` as an object holding
    an ``authors`` list; confirm against the API reference. A plain list of
    profile dicts is accepted as well.
    """
    if isinstance(profiles, dict):
        authors = profiles.get("authors")
        if isinstance(authors, list):
            kept = [author for author in authors if _profile_matches(author, needle)]
            return {**profiles, "authors": kept}
        # Unknown shape: return it untouched rather than guessing.
        return profiles
    return [profile for profile in profiles if _profile_matches(profile, needle)]
62
+
63
def get_results(query):
    """Run a (optionally location-aware) Google Scholar search.

    Args:
        query: Either a plain search string, or a dict with ``"query"`` (or
            ``"q"``) and an optional ``"location"``.

    Returns:
        ``(profiles, answer, keys)`` where ``profiles`` comes straight from
        ``get_google_scholar_results``, ``answer`` is a list of trimmed-down
        organic results, and ``keys`` are the keys of the first organic
        result (an empty list when there are none).
    """
    if isinstance(query, dict):
        search_text = query.get("query") or query.get("q") or ""
        location = query.get("location")
    else:
        search_text, location = str(query), None

    # The location is also appended to the search text itself.
    if location:
        search_text = f"{search_text} {location}".strip()

    profiles, organic = get_google_scholar_results({"q": search_text}, location=location)
    keys = organic[0].keys() if organic else []

    answer = []
    for item in organic or []:
        entry = {name: item[name] for name in ("title", "result_id", "link") if name in item}
        if "link" in item:
            log.info("Result link: %s", entry["link"])
            # Only Annual Reviews pages are scraped for an abstract.
            if "https://www.annualreviews" in item["link"]:
                entry["abstract"] = get_abstract(item["link"])
        for name in ("snippet", "publication_info", "resources"):
            if name in item:
                entry[name] = item[name]
        answer.append(entry)

    return profiles, answer, keys
111
 
112
+ # -------- Scraping / LLM helpers --------
113
def get_abstract(url: str):
    """Scrape `url` and return the first ``<p>`` paragraph after the word "Abstract".

    Returns the paragraph's inner HTML (without the opening ``<p>`` tag), or
    the string ``"Abstract not found"`` when the page has no HTML, no
    "Abstract" marker, or no complete ``<p>...</p>`` after the marker.

    Only a bare ``<p>`` tag is recognised; paragraphs written with attributes
    (``<p class="...">``) are not matched.
    """
    not_found = "Abstract not found"
    scrape_result = f_app.scrape_url(url, formats=["markdown", "html"])
    html = getattr(scrape_result, "html", None) or ""

    heading = html.find("Abstract")
    if heading == -1:
        return not_found

    open_tag = html.find("<p>", heading)
    if open_tag == -1:
        return not_found

    # Skip past the tag itself so the result holds only the paragraph content.
    body_start = open_tag + len("<p>")
    close_tag = html.find("</p>", body_start)
    if close_tag == -1:
        return not_found

    return html[body_start:close_tag]
121
+
122
def scrape_web(url: str):
    """Scrape `url` with the Firecrawl client and return the page's HTML."""
    return f_app.scrape_url(url, formats=["markdown", "html"]).html
125
 
126
def get_response(chat_client, user):
    """Send `user` to the chat session and return the reply text.

    Args:
        chat_client: Object exposing ``send_message(message)``.
        user: Message to send; ``None`` is replaced by an empty string so the
            SDK is never handed ``None``.

    Returns:
        The text of every non-thought part of the first candidate, joined
        together. An empty string is returned when the response carries no
        candidates or no text parts, instead of raising an opaque
        ``IndexError``/``AttributeError``.
    """
    message = "" if user is None else user
    response = chat_client.send_message(message)

    candidates = getattr(response, "candidates", None) or []
    if not candidates:
        return ""

    content = getattr(candidates[0], "content", None)
    parts = getattr(content, "parts", None) or []
    return "".join(
        part.text
        for part in parts
        if getattr(part, "text", None) and not getattr(part, "thought", False)
    )
132
 
133
def convert_to_json(text):
    """Extract and parse the JSON object embedded in `text`.

    The span from the first ``{`` to the last ``}`` (inclusive) is parsed, so
    surrounding prose or Markdown code fences are ignored.

    Returns:
        The parsed object on success; otherwise a string starting with
        ``"Json Parse Error due to "`` describing the failure.
    """
    start = text.find("{")
    # rfind gives the index of the last brace directly. Slicing with a negative
    # offset from the end breaks when the brace is the final character
    # (text[start:-0] is empty).
    end = text.rfind("}")
    if start == -1 or end < start:
        return "Json Parse Error due to no JSON object found in response"
    try:
        return json.loads(text[start : end + 1])
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        return "Json Parse Error due to " + str(e)
141
 
142
def get_observation(function, inp):
    """Run the tool named `function` on `inp` and wrap the result as an OBSERVATION.

    Args:
        function: Tool name, ``"get_results"`` or ``"scrape_web"``.
        inp: Tool input. Either a plain string, or the params dict produced
            by the model (``{"query"/"q", "location"}`` for ``get_results``;
            a dict carrying the URL for ``scrape_web``).

    Returns:
        ``{"state": "OBSERVATION", "observation": {...}}``. An unknown tool
        name yields an observation holding a retry message.
    """
    if function == "get_results":
        if isinstance(inp, dict):
            request_input = {
                "query": inp.get("query") or inp.get("q") or "",
                "location": inp.get("location"),
            }
        else:
            request_input = inp
        profiles, answer, keys = get_results(request_input)
        observation = {
            "profiles": profiles,
            "answer": answer,
            # dict_keys views are turned into a list so the payload is plain data.
            "keys": list(keys) if keys else [],
        }
    elif function == "scrape_web":
        # scrape_web needs the URL string itself, not the params dict around it.
        observation = {"html_text": scrape_web(_extract_url(inp))}
    else:
        observation = {"message": "Function Not found, Please Retry"}

    return {"state": "OBSERVATION", "observation": observation}


def _extract_url(inp):
    """Return the URL from a params dict (``"url"`` key, else first string value)."""
    if not isinstance(inp, dict):
        return inp
    url = inp.get("url")
    if url:
        return url
    return next((value for value in inp.values() if isinstance(value, str)), None)
171
 
172
def get_output(chat_client, inp, max_steps: int = 25):
    """Drive the model's JSON state machine until it reaches the OUTPUT state.

    Each model reply is parsed with ``convert_to_json``. A ``CALL`` state runs
    the requested tool via ``get_observation`` and feeds the observation back;
    every other non-final state is echoed back to the model unchanged.

    Args:
        chat_client: Chat session passed through to ``get_response``.
        inp: Initial message (stringified before sending).
        max_steps: Upper bound on follow-up round trips, so a model that never
            emits OUTPUT cannot loop forever.

    Returns:
        The parsed OUTPUT dict. If a reply could not be parsed, the error
        string from ``convert_to_json`` is returned; if `max_steps` is
        exhausted, the last non-final dict is returned.
    """
    output = convert_to_json(get_response(chat_client, str(inp)))

    steps = 0
    while isinstance(output, dict) and output.get("state") != "OUTPUT" and steps < max_steps:
        steps += 1
        if output.get("state") == "CALL":
            params = output.get("params")
            # Forward whatever the model supplied (dict or bare value); a
            # missing or empty params entry is passed on as None.
            tool_input = params if params else None
            message = get_observation(output.get("function_name"), tool_input)
        else:
            # PLAN, OBSERVATION and unknown states are all echoed back as-is.
            message = output
        output = convert_to_json(get_response(chat_client, str(message)))

    return output
 
 
 
196
 
197
def chat(query: str):
    """Answer `query` using a fresh Gemini chat session.

    The session is primed with ``SYSTEM_PROMPT``, then the query is sent as a
    ``START`` state and the state machine is run by ``get_output``.

    Returns:
        The ``"output"`` value of the final OUTPUT state. If no final state
        was produced, the parse-error string from ``get_output`` is returned
        when there is one, otherwise a generic failure message. The function
        no longer raises ``TypeError``/``KeyError`` in those cases.
    """
    chat_client = client.chats.create(model="gemini-2.5-flash")
    # Prime the session; the reply to the system prompt itself is not used.
    get_response(chat_client, SYSTEM_PROMPT)

    output = get_output(chat_client, {"state": "START", "user": query})
    if isinstance(output, dict) and "output" in output:
        return output["output"]

    logging.getLogger("rm.py").warning("No final OUTPUT state; last result was: %r", output)
    if isinstance(output, str):
        # convert_to_json reports parse failures as a descriptive string.
        return output
    return "The assistant did not produce a final answer. Please try again."
203
 
204
+ # -------- Routes --------
205
@app.route("/", methods=["GET"])
def default():
    """Root endpoint: reply with a fixed JSON status message."""
    payload = {"message": "Backend Working Successfully"}
    return jsonify(payload)
208
 
209
@app.route("/chat", methods=["POST", "GET"])
def get_chat_results():
    """Chat endpoint.

    Reads ``query`` from the JSON body on POST or from the query string on
    GET. Responds with ``{"error": ...}`` and status 400 when it is missing
    or empty, otherwise with ``{"output": ...}`` holding the result of
    ``chat(query)``.
    """
    if request.method == "POST":
        # silent=True: a missing or invalid JSON body is treated as an empty payload.
        payload = request.get_json(silent=True) or {}
        query = payload.get("query")
    else:  # GET
        query = request.args.get("query")

    if not query:
        return jsonify({"error": "No query provided"}), 400

    return jsonify({"output": chat(query)})
222
+
223
 
224
 
225