Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Running

App Files Files Community

tobacco-watcher-chat-with-citations / feed_to_llm_v2.py

vtiyyal1

Upload 2 files

4c7c1f7 verified about 5 hours ago

raw

history blame contribute delete

6.29 kB

	from langchain_openai import ChatOpenAI

	from langchain.schema import (
	HumanMessage,
	SystemMessage
	)
	import tiktoken
	import re

	from get_articles import save_solr_articles_full
	from rerank import crossencoder_rerank_answer
	import logging
	from logging.handlers import RotatingFileHandler

	# Configure logging
	# Records go to a size-capped rotating file (10 MiB per file, 3 backups kept).
	logger = logging.getLogger("TobaccoInfoAssistant")
	logger.setLevel(logging.INFO)

	formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

	# logging.getLogger() hands back the same logger object for the same name, so
	# executing this module a second time (importlib.reload, hot-reload) would
	# otherwise stack another file handler and duplicate every log line.
	handler = next(
	    (h for h in logger.handlers if isinstance(h, RotatingFileHandler)),
	    None,
	)
	if handler is None:
	    handler = RotatingFileHandler(
	        "tobacco_info_assistant.log",
	        maxBytes=10 * 1024 * 1024,
	        backupCount=3,
	        # Prompts, article titles and model output are logged verbatim and may
	        # contain non-ASCII text; do not depend on the platform default codec.
	        encoding="utf-8",
	    )
	    handler.setFormatter(formatter)
	    logger.addHandler(handler)

	def num_tokens_from_string(string: str, encoder) -> int:
	    """Count the tokens that ``encoder`` produces for ``string``.

	    ``encoder`` must expose ``encode(text)`` returning a sized sequence; the
	    result is the length of that sequence.
	    """
	    return len(encoder.encode(string))


	def feed_articles_to_gpt_with_links(information, question):
	    """Answer ``question`` from retrieved articles and attach numbered citations.

	    Articles are appended to the system prompt as "Article N: ..." until the
	    token budget is exhausted, the chat model is invoked, and every
	    "Article N" / "(Article N)" / "Articles N and M" reference in its answer is
	    rewritten to a bracketed citation number. Citation numbers follow the order
	    in which articles are first mentioned in the answer.

	    Args:
	        information: Iterable of 6-tuples
	            ``(score, contents, uuid, title, domain, published_date)``.
	        question: The user's question, sent as the human message.

	    Returns:
	        A 5-tuple ``(answer, links, titles, domains, published_dates)``. The
	        four lists are parallel and hold one entry per cited article, in
	        citation order. When the answer cites no usable article, ``answer`` is
	        the raw model output and all four lists are empty.
	    """
	    prompt = """
	You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.

	When formulating your response, adhere to the following guidelines:

	1. Use information from the provided articles to directly answer the question. Explicitly reference the article(s) used in your response by stating the article number(s) (e.g., "According to Article 1, ..." or "Articles 2 and 3 mention that...").
	2. If the answer is not covered by any of the articles, clearly state that the information is unavailable. Do not guess or fabricate information.
	3. Avoid using ambiguous time references like 'recently' or 'last year.' Instead, use absolute terms based on the article's content (e.g., 'In 2021' or 'As per Article 2, published in 2020').
	4. Keep responses concise, accurate, and helpful while maintaining a professional tone.

	Below is a list of articles you can reference. Each article is identified by its number and content:
	"""
	    end_prompt = "\n----------------\n"
	    prompt += end_prompt

	    separator = "<<<<>>>>"

	    # Encoder setup for token count tracking.
	    # NOTE(review): the count uses the gpt-3.5-turbo encoding while the request
	    # goes to gpt-4o-mini, so the budget is an approximation - confirm intended.
	    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
	    token_count = num_tokens_from_string(prompt, encoder)

	    # Split the retrieval records into parallel per-field lists in one pass.
	    articles, uuids, titles_list, domains_list, published_dates = [], [], [], [], []
	    for _score, contents, uuid, title, domain, published in information:
	        articles.append(contents)
	        uuids.append(uuid)
	        titles_list.append(title)
	        domains_list.append(domain)
	        published_dates.append(published)
	    logger.info("Article retrieved: %s", len(articles))
	    logger.info("Article titles: %s", titles_list)

	    # Add articles to the prompt, stopping at the first one that would push
	    # the running total over the budget.
	    included = []
	    for number, article in enumerate(articles, start=1):
	        addition = f"Article {number}: {article} {separator}"
	        token_count += num_tokens_from_string(addition, encoder)
	        if token_count > _MAX_PROMPT_TOKENS:
	            break
	        included.append(addition)
	    prompt += "".join(included)
	    logger.info("Prompt: %s", prompt)

	    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
	    message = [
	        SystemMessage(content=prompt),
	        HumanMessage(content=question),
	    ]
	    response = llm.invoke(message)
	    response_content = response.content  # Access the content of the AIMessage
	    logger.info("LLM Response Content: %s", response_content)

	    # Only articles that actually made it into the prompt can be cited; any
	    # other number in the answer is left as written instead of being indexed.
	    modified_response, cited_indices = _renumber_citations(
	        response_content, len(included)
	    )
	    if not cited_indices:
	        # Same 5-tuple shape as the normal return so callers can always unpack.
	        return response_content, [], [], [], []

	    citations = []
	    cited_links = []
	    cited_titles = []
	    cited_domains = []
	    cited_published_dates = []
	    for citation_num, article_idx in enumerate(cited_indices, start=1):
	        publication_date = published_dates[article_idx] or "Unknown Date"
	        citations.append(
	            f"[{citation_num}] {titles_list[article_idx]} "
	            f"({domains_list[article_idx]}) {publication_date}"
	        )
	        cited_links.append(
	            "https://tobaccowatcher.globaltobaccocontrol.org/articles/"
	            f"{uuids[article_idx]}/"
	        )
	        cited_titles.append(titles_list[article_idx])
	        cited_domains.append(domains_list[article_idx])
	        cited_published_dates.append(published_dates[article_idx])

	    # Format final response with citations
	    response_with_citations = (
	        f"{modified_response}\n\nReferences:\n" + "\n".join(citations)
	    )
	    return (
	        response_with_citations,
	        cited_links,
	        cited_titles,
	        cited_domains,
	        cited_published_dates,
	    )


	# Upper bound on prompt tokens (instructions plus articles) in the system message.
	_MAX_PROMPT_TOKENS = 3500

	# "Articles 2, 3 and 5": the list form is only accepted after the plural keyword
	# so that "Article 1 and 20 other studies" is not misread as two references.
	_ARTICLE_LIST = r"Articles\s+\d+(?:\s*(?:,\s*and|,|and|&)\s*\d+)*"
	# A reference is either fully parenthesised, "(Article 3)", or bare.
	_ARTICLE_REF = re.compile(
	    rf"\((?P<paren>Article\s+\d+|{_ARTICLE_LIST})\)"
	    rf"|\b(?P<bare>Article\s+\d+|{_ARTICLE_LIST})"
	)


	def _renumber_citations(text, num_available):
	    """Rewrite article references in ``text`` to bracketed citation numbers.

	    Returns ``(new_text, cited_indices)`` where ``cited_indices`` lists the
	    zero-based article indices in order of first mention; the citation number
	    of an article is its position in that list plus one. Numbers outside
	    ``1..num_available`` are not citations and are left in the text.
	    """
	    citation_for = {}  # zero-based article index -> citation number

	    def is_valid(number_text):
	        return 1 <= int(number_text) <= num_available

	    def to_marker(number_match):
	        number_text = number_match.group()
	        if not is_valid(number_text):
	            return f"Article {number_text}"
	        index = int(number_text) - 1
	        citation = citation_for.setdefault(index, len(citation_for) + 1)
	        return f"[{citation}]"

	    def substitute(ref_match):
	        wrapped = ref_match.group("paren")
	        body = wrapped or ref_match.group("bare")
	        numbers = re.findall(r"\d+", body)
	        if not any(is_valid(n) for n in numbers):
	            return ref_match.group(0)
	        # Drop the "Article"/"Articles" keyword, keep the separators as written.
	        number_part = body.split(None, 1)[1]
	        rewritten = re.sub(r"\d+", to_marker, number_part)
	        # Parentheses are absorbed into the markers only when every number in
	        # them became one; otherwise keep them around the remaining words.
	        if wrapped and not all(is_valid(n) for n in numbers):
	            return f"({rewritten})"
	        return rewritten

	    # One regex pass over whole numbers: "Article 10" can never be corrupted
	    # by a replacement meant for "Article 1".
	    new_text = _ARTICLE_REF.sub(substitute, text)
	    return new_text, list(citation_for)

	if __name__ == "__main__":
	    # Manual smoke test of the full pipeline: fetch articles for the question,
	    # rerank them with the cross-encoder, then ask the LLM and show the result.
	    question = "How is United States fighting against tobacco addiction?"
	    csv_path = save_solr_articles_full(question, keyword_type="rake")
	    reranked_out = crossencoder_rerank_answer(csv_path, question)
	    result = feed_articles_to_gpt_with_links(reranked_out, question)
	    # The answer text is the first element of the result, the links the second.
	    print(result[0])
	    for link in result[1]:
	        print(link)