Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Running

App Files Community

vtiyyal1 committed 1 day ago

Commit

9ca13f0

•

1 Parent(s): cde1f49

Upload feed_to_llm_v2.py

Browse files

Files changed (1) hide show

feed_to_llm_v2.py +40 -21

feed_to_llm_v2.py CHANGED Viewed

@@ -65,33 +65,52 @@ def feed_articles_to_gpt_with_links(information, question):
     print("LLM Response Content:", response_content)
     # Extract sources from the response content
-    matches = re.findall(r'\((Article \d+)\)', response_content)
-    if not matches:
-        print("No sources found in the response.")
         return response_content, [], [], []
-    unique_matches = list(set(matches))
-    used_article_nums = [int(re.findall(r'\d+', match)[0]) - 1 for match in unique_matches]
-    # Create citation list
     citations = []
-    for idx, num in enumerate(used_article_nums, start=1):
-        citation = f"{idx}. {titles_list[num]} ({domains_list[num]})"
         citations.append(citation)
-    # Replace article numbers with citation numbers in response
-    for i, match in enumerate(unique_matches, start=1):
-        response_content = response_content.replace(match, f"[{i}]")
-    # Append citations to the response
-    response_with_citations = f"{response_content}\n\nReferences:\n" + "\n".join(citations)
-    # Prepare links with titles and domains
-    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
-    hyperlinks = [f"<a href='{link}' target='_blank'>{titles_list[i]}</a> ({domains_list[i]})" for i, link in enumerate(links)]
-    return response_with_citations, hyperlinks, titles_list, domains_list
 if __name__ == "__main__":
     question = "How is United States fighting against tobacco addiction?"

     print("LLM Response Content:", response_content)
     # Extract sources from the response content
+    inline_matches = re.findall(r'Article \d+', response_content)
+    parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
+    if not (inline_matches or parenthetical_matches):
         return response_content, [], [], []
+    # Combine and get unique article numbers
+    all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
+    unique_articles = list(set(all_matches))
+    used_article_nums = [int(re.findall(r'\d+', match)[0]) - 1 for match in unique_articles]
+    # Create citation mapping
+    citation_map = {}
     citations = []
+    for idx, article_num in enumerate(used_article_nums, start=1):
+        original = f"Article {article_num + 1}"
+        citation_map[original] = f"[{idx}]"
+        citation = f"[{idx}] {titles_list[article_num]} ({domains_list[article_num]})"
         citations.append(citation)
+    # Replace all article references with citation numbers
+    modified_response = response_content
+    for original, citation_num in citation_map.items():
+        # Replace both inline and parenthetical references
+        modified_response = modified_response.replace(f"({original})", citation_num)
+        modified_response = modified_response.replace(original, citation_num)
+    # Format final response with citations
+    response_with_citations = (
+        f"{modified_response}\n\n"
+        f"References:\n"
+        f"{chr(10).join(citations)}"
+    )
+    # Prepare links only for cited articles
+    cited_links = []
+    cited_titles = []
+    cited_domains = []
+    for article_num in used_article_nums:
+        uuid = uuids[article_num]
+        link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
+        cited_links.append(link)
+        cited_titles.append(titles_list[article_num])
+        cited_domains.append(domains_list[article_num])
+    return response_with_citations, cited_links, cited_titles, cited_domains
 if __name__ == "__main__":
     question = "How is United States fighting against tobacco addiction?"