tobacco-watcher-chat / feed_to_llm.py
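"""Builds a prompt from retrieved Tobacco Watcher articles, asks an OpenAI chat
model for an answer via LangChain, and maps the article numbers cited in the
response back to article links, titles, and domains."""
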
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
HumanMessage,
SystemMessage
)
import tiktoken
import re

def num_tokens_from_string(string: str, encoder) -> int:
    """Return the number of tokens in `string` under the given tiktoken encoder."""
    return len(encoder.encode(string))

def feed_articles_to_gpt_with_links(information, question):
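    """Build a prompt from the retrieved articles, ask the chat model the question,
    and return the answer together with the links, titles, and domains of the
    articles it cites.

    `information` is expected to be an iterable of
    (score, contents, uuid, title, domain) tuples, as unpacked below.
    """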
    prompt = "The following pieces of information include relevant articles. \nUse the following sentences to answer the question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer. "
    prompt += "Please state the number of the article used to answer the question after your response\n"
    end_prompt = "\n----------------\n"
    prompt += end_prompt
    content = ""
    separator = "<<<<>>>>"
    token_count = 0
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count += num_tokens_from_string(prompt, encoder)
    # Each element of `information` is a (score, contents, uuid, title, domain) tuple.
    articles = [contents for score, contents, uuids, titles, domains in information]
    uuids = [uuids for score, contents, uuids, titles, domains in information]
    domains = [domains for score, contents, uuids, titles, domains in information]
    # Append numbered articles until the prompt nears the context limit
    # (3500 tokens leaves headroom for the response in gpt-3.5-turbo's 4096-token window).
    for i in range(len(articles)):
        addition = "Article " + str(i + 1) + ": " + articles[i] + separator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            print(i)
            break
        content += addition
    prompt += content
    llm = ChatOpenAI(temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]
    response = llm(message)
    print(response.content)
    print("response length: ", len(response.content))
    # Second pass: ask the model whether the response actually answered the question.
    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
    message = [
        SystemMessage(content=answer_found_prompt),
        HumanMessage(content=response.content)
    ]
    answer_found = llm(message).content
    print(answer_found)
    if answer_found.strip() == "0":
        return "I could not find the answer.", [], [], []
    # sources = "\n Sources: \n"
    # for i in range(len(uuids)):
    #     link = "https://tobaccowatcher.globaltobaccocontrol.org/articles/" + uuids[i] + "/" + "\n"
    #     sources += link
    # response.content += sources
    # Parse which article numbers the model cited, e.g. "Article 2".
    lowercase_response = response.content.lower()
    # Remove parentheses so "(article 2)" splits cleanly.
    lowercase_response = re.sub('[()]', '', lowercase_response)
    lowercase_split = lowercase_response.split()
    used_article_num = []
    for i in range(len(lowercase_split) - 1):
        if lowercase_split[i] == "article":
            next_word = lowercase_split[i + 1]
            # Get rid of non-numeric characters (e.g. trailing punctuation)
            next_word = ''.join(c for c in next_word if c.isdigit())
            print("Article number: ", next_word)
            # Append only if it is a number and not already present in the list
            if next_word and next_word not in used_article_num:
                used_article_num.append(next_word)
    print("Used article num: ", used_article_num)
    # If no article numbers were found, report that no answer was found.
    if not used_article_num:
        print("I could not find the answer. Reached")
        return "I could not find the answer.", [], [], []
    # Convert the cited 1-based article numbers to 0-based list indices.
    used_article_num = [int(num) - 1 for num in used_article_num]
    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
    titles = [titles for score, contents, uuids, titles, domains in information]
    # Ignore any cited article numbers that fall outside the retrieved list.
    used_article_num = [i for i in used_article_num if 0 <= i < len(links)]
    links = [links[i] for i in used_article_num]
    titles = [titles[i] for i in used_article_num]
    domains = [domains[i] for i in used_article_num]
    # Strip the "(Article ...)" citation text from the response.
    response_without_source = re.sub(r"\(Article.*\)", "", response.content)
    return response_without_source, links, titles, domains
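

# Example usage: a minimal sketch with hypothetical sample data. ChatOpenAI reads the
# OPENAI_API_KEY environment variable, so that must be set before running this block.
if __name__ == "__main__":
    # Each tuple is (score, contents, uuid, title, domain); the values below are made up.
    sample_information = [
        (0.92, "Country X announced a ban on menthol cigarettes this week.",
         "00000000-0000-0000-0000-000000000001", "Menthol ban announced", "example.com"),
        (0.87, "Health groups in Country X welcomed the new tobacco regulations.",
         "00000000-0000-0000-0000-000000000002", "Health groups react", "example.org"),
    ]
    answer, links, titles, domains = feed_articles_to_gpt_with_links(
        sample_information, "What tobacco regulations were announced recently?"
    )
    print(answer)
    print(links)
    print(titles)
    print(domains)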