# tobacco-watcher-chat / feed_to_llm_v2.py
# Provenance: Hugging Face Space file, uploaded by vtiyyal1 ("Update feed_to_llm_v2.py", commit d058257, verified).
from langchain_openai import ChatOpenAI
from langchain.schema import (
HumanMessage,
SystemMessage
)
import tiktoken
import re
from get_articles import save_solr_articles_full
from rerank import crossencoder_rerank_answer
def num_tokens_from_string(string: str, encoder) -> int:
    """Return the number of tokens *encoder* produces for *string*.

    The encoder is any object exposing an ``encode(str) -> sequence``
    method (e.g. a tiktoken encoding).
    """
    return len(encoder.encode(string))
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with an LLM grounded in the retrieved articles.

    Parameters
    ----------
    information : iterable of 5-tuples
        Reranked rows of ``(score, contents, uuid, title, domain)``,
        most relevant first (shape assumed from the unpacking below —
        TODO confirm against ``crossencoder_rerank_answer``).
    question : str
        The user's question, sent as the human message.

    Returns
    -------
    tuple
        ``(answer_text, links, titles, domains)`` restricted to the
        articles the model cited in its "Sources:" field. All three
        lists are empty when the response contains no "Sources:" field.
    """
    prompt = """
You are a Question Answering machine specialized in providing information on tobacco-related queries. You have access to a curated list of articles that span various aspects of tobacco use, health effects, legislation, and quitting resources. When responding to questions, follow these guidelines:
1. Use information from the articles to formulate your answers. Indicate the article number you're referencing at the end of your response. At the end of your response, include the field "Sources:" with a sequence of comma separated numbers indicating the articles used in your response. For example, if you used articles 3 and 5, you should write "Sources: 3,5"
2. If the question's answer is not covered by your articles, clearly state that you do not know the answer. Do not attempt to infer or make up information.
3. Avoid using time-relative terms like 'last year,' 'recently,' etc., as the articles' publication dates and the current date may not align. Instead, use absolute terms (e.g., 'In 2022,' 'As of the article's 2020 publication,').
4. Aim for concise, informative responses that directly address the question asked.
Remember, your goal is to provide accurate, helpful information on tobacco-related topics, aiding in education and informed decision-making.
"""
    end_prompt = "\n----------------\n"
    prompt += end_prompt

    separator = "<<<<>>>>"
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count = num_tokens_from_string(prompt, encoder)

    # Unpack the reranked rows once, keeping the columns index-aligned.
    articles = [contents for score, contents, uuids, titles, domains in information]
    uuids = [uuids for score, contents, uuids, titles, domains in information]
    titles = [titles for score, contents, uuids, titles, domains in information]
    domains = [domains for score, contents, uuids, titles, domains in information]

    # Append numbered articles until the ~3500-token context budget is hit.
    # Bug fix: the article text was previously concatenated TWICE per entry
    # ("Article N: <text><sep><text><sep>"), wasting half the budget and
    # duplicating every article shown to the model.
    content = ""
    for i, article in enumerate(articles):
        addition = "Article " + str(i + 1) + ": " + article + separator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            print(i)
            break
        content += addition
    prompt += content

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]
    response = llm.invoke(message)
    response_content = response.content  # Access the content of the AIMessage
    print("LLM Response Content:", response_content)

    # Extract the machine-readable "Sources: 1,2,..." field the system
    # prompt asked the model to emit.
    sources_pattern = r"Sources:\s*([\d,]+)"
    sources_match = re.search(sources_pattern, response_content)
    if not sources_match:
        # No citations: return the raw answer with empty citation lists.
        print("No sources found in the response.")
        return response_content, [], [], []

    # Split the matched source numbers, drop empty fragments, and convert
    # the 1-based article numbers to 0-based list indices.
    source_numbers = [num.strip() for num in sources_match.group(1).split(',') if num.strip()]
    used_article_num = [int(num) - 1 for num in source_numbers]

    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]

    # Keep only the cited articles; silently ignore out-of-range citations.
    links = [links[i] for i in used_article_num if 0 <= i < len(links)]
    titles = [titles[i] for i in used_article_num if 0 <= i < len(titles)]
    domains = [domains[i] for i in used_article_num if 0 <= i < len(domains)]

    # Strip the Sources field from the user-facing answer text.
    response_without_source = re.sub(r"Sources:\s*[\d,]+", "", response_content).strip()
    return response_without_source, links, titles, domains
if __name__ == "__main__":
    # Smoke-test the full pipeline: retrieve candidate articles from Solr,
    # rerank them with the cross-encoder, then feed the top results to the LLM.
    # (Removed the unused `rerank_type`/`llm_type` locals — they were never read.)
    question = "How is United States fighting against tobacco addiction?"
    csv_path = save_solr_articles_full(question, keyword_type="rake")
    reranked_out = crossencoder_rerank_answer(csv_path, question)
    feed_articles_to_gpt_with_links(reranked_out, question)