Spaces:
Sleeping
Sleeping
Update citation parser to handle multiple references
Browse files- api/chatbot.py +31 -14
api/chatbot.py
CHANGED
|
@@ -124,7 +124,7 @@ class RAGMedicalChatbot:
|
|
| 124 |
# 5. Search context with citation instructions
|
| 125 |
if search_context:
|
| 126 |
parts.append("Additional information from web search:\n" + search_context)
|
| 127 |
-
parts.append("IMPORTANT: When you use information from the web search results above, you MUST add
|
| 128 |
|
| 129 |
parts.append(f"User's question: {user_query}")
|
| 130 |
parts.append(f"Language to generate answer: {lang}")
|
|
@@ -149,24 +149,41 @@ class RAGMedicalChatbot:
|
|
| 149 |
return response.strip()
|
| 150 |
|
| 151 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
| 152 |
-
"""Replace citation tags with actual URLs"""
|
| 153 |
|
| 154 |
-
#
|
| 155 |
-
citation_pattern = r'<#(
|
| 156 |
-
citations_found = re.findall(citation_pattern, response)
|
| 157 |
|
| 158 |
def replace_citation(match):
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
# Replace citations with URLs
|
| 169 |
processed_response = re.sub(citation_pattern, replace_citation, response)
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
return processed_response
|
|
|
|
| 124 |
# 5. Search context with citation instructions
|
| 125 |
if search_context:
|
| 126 |
parts.append("Additional information from web search:\n" + search_context)
|
| 127 |
+
parts.append("IMPORTANT: When you use information from the web search results above, you MUST add citation tags immediately after the relevant content. Use single citations like <#1> or multiple citations like <#1, #2, #5> when information comes from multiple sources. For example: 'According to recent studies <#1>, this condition affects...' or 'Multiple sources <#1, #3, #7> suggest that...'")
|
| 128 |
|
| 129 |
parts.append(f"User's question: {user_query}")
|
| 130 |
parts.append(f"Language to generate answer: {lang}")
|
|
|
|
| 149 |
return response.strip()
|
| 150 |
|
| 151 |
def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
|
| 152 |
+
"""Replace citation tags with actual URLs, handling both single and multiple references"""
|
| 153 |
|
| 154 |
+
# Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
|
| 155 |
+
citation_pattern = r'<#([^>]+)>'
|
|
|
|
| 156 |
|
| 157 |
def replace_citation(match):
|
| 158 |
+
citation_content = match.group(1)
|
| 159 |
+
# Split by comma and clean up each citation ID
|
| 160 |
+
citation_ids = [id_str.strip() for id_str in citation_content.split(',')]
|
| 161 |
+
|
| 162 |
+
urls = []
|
| 163 |
+
for citation_id in citation_ids:
|
| 164 |
+
try:
|
| 165 |
+
doc_id = int(citation_id)
|
| 166 |
+
if doc_id in url_mapping:
|
| 167 |
+
url = url_mapping[doc_id]
|
| 168 |
+
urls.append(f'<{url}>')
|
| 169 |
+
logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
|
| 170 |
+
else:
|
| 171 |
+
logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
|
| 172 |
+
urls.append(f'<#{doc_id}>') # Keep original if URL not found
|
| 173 |
+
except ValueError:
|
| 174 |
+
logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
|
| 175 |
+
urls.append(f'<#{citation_id}>') # Keep original if invalid
|
| 176 |
+
|
| 177 |
+
# Join multiple URLs with spaces
|
| 178 |
+
return ' '.join(urls)
|
| 179 |
|
| 180 |
# Replace citations with URLs
|
| 181 |
processed_response = re.sub(citation_pattern, replace_citation, response)
|
| 182 |
|
| 183 |
+
# Count total citations processed
|
| 184 |
+
citations_found = re.findall(citation_pattern, response)
|
| 185 |
+
total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
|
| 186 |
+
for citation_content in citations_found)
|
| 187 |
+
|
| 188 |
+
logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
|
| 189 |
return processed_response
|