vtiyyal1 committed on
Commit
9ca13f0
1 Parent(s): cde1f49

Upload feed_to_llm_v2.py

Browse files
Files changed (1) hide show
  1. feed_to_llm_v2.py +40 -21
feed_to_llm_v2.py CHANGED
@@ -65,33 +65,52 @@ def feed_articles_to_gpt_with_links(information, question):
65
  print("LLM Response Content:", response_content)
66
 
67
  # Extract sources from the response content
68
- matches = re.findall(r'\((Article \d+)\)', response_content)
69
- if not matches:
70
- print("No sources found in the response.")
 
71
  return response_content, [], [], []
72
 
73
- unique_matches = list(set(matches))
74
- used_article_nums = [int(re.findall(r'\d+', match)[0]) - 1 for match in unique_matches]
 
 
75
 
76
- # Create citation list
 
77
  citations = []
78
- for idx, num in enumerate(used_article_nums, start=1):
79
- citation = f"{idx}. {titles_list[num]} ({domains_list[num]})"
 
 
80
  citations.append(citation)
81
 
82
- # Replace article numbers with citation numbers in response
83
- for i, match in enumerate(unique_matches, start=1):
84
- response_content = response_content.replace(match, f"[{i}]")
85
-
86
- # Append citations to the response
87
- response_with_citations = f"{response_content}\n\nReferences:\n" + "\n".join(citations)
88
-
89
- # Prepare links with titles and domains
90
- links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
91
- hyperlinks = [f"<a href='{link}' target='_blank'>{titles_list[i]}</a> ({domains_list[i]})" for i, link in enumerate(links)]
92
-
93
- return response_with_citations, hyperlinks, titles_list, domains_list
94
-
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  if __name__ == "__main__":
97
  question = "How is United States fighting against tobacco addiction?"
 
65
  print("LLM Response Content:", response_content)
66
 
67
  # Extract sources from the response content
68
+ inline_matches = re.findall(r'Article \d+', response_content)
69
+ parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
70
+
71
+ if not (inline_matches or parenthetical_matches):
72
  return response_content, [], [], []
73
 
74
+ # Combine and get unique article numbers
75
+ all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
76
+ unique_articles = list(set(all_matches))
77
+ used_article_nums = [int(re.findall(r'\d+', match)[0]) - 1 for match in unique_articles]
78
 
79
+ # Create citation mapping
80
+ citation_map = {}
81
  citations = []
82
+ for idx, article_num in enumerate(used_article_nums, start=1):
83
+ original = f"Article {article_num + 1}"
84
+ citation_map[original] = f"[{idx}]"
85
+ citation = f"[{idx}] {titles_list[article_num]} ({domains_list[article_num]})"
86
  citations.append(citation)
87
 
88
+ # Replace all article references with citation numbers
89
+ modified_response = response_content
90
+ for original, citation_num in citation_map.items():
91
+ # Replace both inline and parenthetical references
92
+ modified_response = modified_response.replace(f"({original})", citation_num)
93
+ modified_response = modified_response.replace(original, citation_num)
94
+
95
+ # Format final response with citations
96
+ response_with_citations = (
97
+ f"{modified_response}\n\n"
98
+ f"References:\n"
99
+ f"{chr(10).join(citations)}"
100
+ )
101
+
102
+ # Prepare links only for cited articles
103
+ cited_links = []
104
+ cited_titles = []
105
+ cited_domains = []
106
+ for article_num in used_article_nums:
107
+ uuid = uuids[article_num]
108
+ link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
109
+ cited_links.append(link)
110
+ cited_titles.append(titles_list[article_num])
111
+ cited_domains.append(domains_list[article_num])
112
+
113
+ return response_with_citations, cited_links, cited_titles, cited_domains
114
 
115
  if __name__ == "__main__":
116
  question = "How is United States fighting against tobacco addiction?"