Spaces:

mdredze1
/

tobacco-watcher-chat

Running

vtiyyal1 committed on Nov 21, 2024

Commit

2bf4ad8

verified ·

1 Parent(s): 11d1b17

Update feed_to_llm_v2.py

Files changed (1) hide show

feed_to_llm_v2.py CHANGED Viewed

@@ -61,26 +61,30 @@ def feed_articles_to_gpt_with_links(information, question):
     response_content = response.content  # Access the content of the AIMessage
     print("LLM Response Content:", response_content)
-    # Extract sources from the response content
-    matches = re.findall(r'\((.*?)\)', response_content)
-    if not matches:
         print("No sources found in the response.")
         return response_content, [], [], []
-    source = matches[-1]
-    # Get integers from source
-    source = re.findall(r'\d+', source)
-    used_article_num = [int(i) - 1 for i in source]
     links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
     titles = [titles for score, contents, uuids, titles, domains in information]
-    links = [links[i] for i in used_article_num]
-    titles = [titles[i] for i in used_article_num]
-    domains = [domains[i] for i in used_article_num]
-    response_without_source = re.sub(r"\(Article.*\)", "", response_content)
     return response_without_source, links, titles, domains
 if __name__ == "__main__":

     response_content = response.content  # Access the content of the AIMessage
     print("LLM Response Content:", response_content)
+    # Use regex to extract Sources field
+    sources_pattern = r"Sources:\s*([\d,]+)"
+    sources_match = re.search(sources_pattern, response_content)
+    if sources_match:
+        # Extract the list of article numbers
+        source_numbers = sources_match.group(1).split(',')
+        used_article_num = [int(num.strip()) - 1 for num in source_numbers]
+    else:
         print("No sources found in the response.")
         return response_content, [], [], []
+    # Prepare links, titles, and domains for the cited articles
     links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
     titles = [titles for score, contents, uuids, titles, domains in information]
+    # Filter to only the cited articles
+    links = [links[i] for i in used_article_num if 0 <= i < len(links)]
+    titles = [titles[i] for i in used_article_num if 0 <= i < len(titles)]
+    domains = [domains[i] for i in used_article_num if 0 <= i < len(domains)]
+    # Remove the Sources field from the response if needed
+    response_without_source = re.sub(r"Sources:\s*[\d,]+", "", response_content).strip()
+    # Return the cleaned response and the citations
     return response_without_source, links, titles, domains
 if __name__ == "__main__":