BinKhoaLe1812 committed on
Commit
b49a32d
·
verified ·
1 Parent(s): 337fac1

Upd citation parser for multi refs

Browse files
Files changed (1) hide show
  1. api/chatbot.py +31 -14
api/chatbot.py CHANGED
@@ -124,7 +124,7 @@ class RAGMedicalChatbot:
124
  # 5. Search context with citation instructions
125
  if search_context:
126
  parts.append("Additional information from web search:\n" + search_context)
127
- parts.append("IMPORTANT: When you use information from the web search results above, you MUST add a citation tag <#ID> immediately after the relevant content, where ID is the document number (1, 2, 3, etc.). For example: 'According to recent studies <#1>, this condition affects...'")
128
 
129
  parts.append(f"User's question: {user_query}")
130
  parts.append(f"Language to generate answer: {lang}")
@@ -149,24 +149,41 @@ class RAGMedicalChatbot:
149
  return response.strip()
150
 
151
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
152
- """Replace citation tags with actual URLs"""
153
 
154
- # Find all citation tags like <#1>, <#2>, etc.
155
- citation_pattern = r'<#(\d+)>'
156
- citations_found = re.findall(citation_pattern, response)
157
 
158
  def replace_citation(match):
159
- doc_id = int(match.group(1))
160
- if doc_id in url_mapping:
161
- url = url_mapping[doc_id]
162
- logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
163
- return f'<{url}>'
164
- else:
165
- logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
166
- return match.group(0) # Keep original if URL not found
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  # Replace citations with URLs
169
  processed_response = re.sub(citation_pattern, replace_citation, response)
170
 
171
- logger.info(f"[CITATION] Processed {len(citations_found)} citations, {len(url_mapping)} URL mappings available")
 
 
 
 
 
172
  return processed_response
 
124
  # 5. Search context with citation instructions
125
  if search_context:
126
  parts.append("Additional information from web search:\n" + search_context)
127
+ parts.append("IMPORTANT: When you use information from the web search results above, you MUST add citation tags immediately after the relevant content. Use single citations like <#1> or multiple citations like <#1, #2, #5> when information comes from multiple sources. For example: 'According to recent studies <#1>, this condition affects...' or 'Multiple sources <#1, #3, #7> suggest that...'")
128
 
129
  parts.append(f"User's question: {user_query}")
130
  parts.append(f"Language to generate answer: {lang}")
 
149
  return response.strip()
150
 
151
  def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
152
+ """Replace citation tags with actual URLs, handling both single and multiple references"""
153
 
154
+ # Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
155
+ citation_pattern = r'<#([^>]+)>'
 
156
 
157
  def replace_citation(match):
158
+ citation_content = match.group(1)
159
+ # Split by comma and clean up each citation ID
160
+ citation_ids = [id_str.strip() for id_str in citation_content.split(',')]
161
+
162
+ urls = []
163
+ for citation_id in citation_ids:
164
+ try:
165
+ doc_id = int(citation_id)
166
+ if doc_id in url_mapping:
167
+ url = url_mapping[doc_id]
168
+ urls.append(f'<{url}>')
169
+ logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
170
+ else:
171
+ logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
172
+ urls.append(f'<#{doc_id}>') # Keep original if URL not found
173
+ except ValueError:
174
+ logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
175
+ urls.append(f'<#{citation_id}>') # Keep original if invalid
176
+
177
+ # Join multiple URLs with spaces
178
+ return ' '.join(urls)
179
 
180
  # Replace citations with URLs
181
  processed_response = re.sub(citation_pattern, replace_citation, response)
182
 
183
+ # Count total citations processed
184
+ citations_found = re.findall(citation_pattern, response)
185
+ total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
186
+ for citation_content in citations_found)
187
+
188
+ logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
189
  return processed_response