vtiyyal1 committed on
Commit
2bf4ad8
·
verified ·
1 Parent(s): 11d1b17

Update feed_to_llm_v2.py

Browse files
Files changed (1) hide show
  1. feed_to_llm_v2.py +17 -13
feed_to_llm_v2.py CHANGED
@@ -61,26 +61,30 @@ def feed_articles_to_gpt_with_links(information, question):
61
  response_content = response.content # Access the content of the AIMessage
62
  print("LLM Response Content:", response_content)
63
 
64
- # Extract sources from the response content
65
- matches = re.findall(r'\((.*?)\)', response_content)
66
- if not matches:
 
 
 
 
 
67
  print("No sources found in the response.")
68
  return response_content, [], [], []
69
 
70
- source = matches[-1]
71
-
72
- # Get integers from source
73
- source = re.findall(r'\d+', source)
74
- used_article_num = [int(i) - 1 for i in source]
75
-
76
  links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
77
  titles = [titles for score, contents, uuids, titles, domains in information]
78
 
79
- links = [links[i] for i in used_article_num]
80
- titles = [titles[i] for i in used_article_num]
81
- domains = [domains[i] for i in used_article_num]
 
 
 
 
82
 
83
- response_without_source = re.sub(r"\(Article.*\)", "", response_content)
84
  return response_without_source, links, titles, domains
85
 
86
  if __name__ == "__main__":
 
61
  response_content = response.content # Access the content of the AIMessage
62
  print("LLM Response Content:", response_content)
63
 
64
+ # Use regex to extract Sources field
65
+ sources_pattern = r"Sources:\s*([\d,]+)"
66
+ sources_match = re.search(sources_pattern, response_content)
67
+ if sources_match:
68
+ # Extract the list of article numbers
69
+ source_numbers = sources_match.group(1).split(',')
70
+ used_article_num = [int(num.strip()) - 1 for num in source_numbers]
71
+ else:
72
  print("No sources found in the response.")
73
  return response_content, [], [], []
74
 
75
+ # Prepare links, titles, and domains for the cited articles
 
 
 
 
 
76
  links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
77
  titles = [titles for score, contents, uuids, titles, domains in information]
78
 
79
+ # Filter to only the cited articles
80
+ links = [links[i] for i in used_article_num if 0 <= i < len(links)]
81
+ titles = [titles[i] for i in used_article_num if 0 <= i < len(titles)]
82
+ domains = [domains[i] for i in used_article_num if 0 <= i < len(domains)]
83
+
84
+ # Remove the Sources field from the response if needed
85
+ response_without_source = re.sub(r"Sources:\s*[\d,]+", "", response_content).strip()
86
 
87
+ # Return the cleaned response and the citations
88
  return response_without_source, links, titles, domains
89
 
90
  if __name__ == "__main__":