# NOTE(review): removed "Spaces: Sleeping" text — residue from a HuggingFace
# Spaces web page copy-paste, not valid Python.
import re

import tiktoken
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
def num_tokens_from_string(string: str, encoder) -> int:
    """Return the number of tokens *encoder* produces for *string*.

    Args:
        string: the text to tokenize.
        encoder: a tokenizer exposing ``encode(str) -> sequence`` (e.g. a
            tiktoken encoding).

    Returns:
        The token count as an int.
    """
    return len(encoder.encode(string))
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with GPT-3.5-turbo using retrieved articles as context.

    Builds a system prompt from the retrieved articles (capped at ~3500
    prompt tokens), asks the model to answer and cite article numbers, then
    asks the model a second time whether an answer was actually found, and
    finally maps the cited article numbers back to their source metadata.

    Args:
        information: iterable of (score, contents, uuid, title, domain)
            tuples, one per retrieved article.
        question: the user's question.

    Returns:
        A tuple ``(answer, links, titles, domains)``. When the model reports
        no answer, or cites no article, returns
        ``("I could not find the answer.", [], [], [])``.
    """
    separator = "<<<<>>>>"
    prompt = "The following pieces of information includes relevant articles. \nUse the following sentences to answer question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer. "
    prompt += "Please state the number of the article used to answer the question after your response\n"
    prompt += "\n----------------\n"

    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count = num_tokens_from_string(prompt, encoder)

    # Unpack the retrieval tuples once; keep all four parallel lists.
    articles = [contents for score, contents, uuid, title, domain in information]
    uuids = [uuid for score, contents, uuid, title, domain in information]
    titles = [title for score, contents, uuid, title, domain in information]
    domains = [domain for score, contents, uuid, title, domain in information]

    content = ""
    for i, article in enumerate(articles):
        # BUG FIX: the original appended each article's text twice
        # (once inside `addition` and once more with `+=`).
        addition = "Article " + str(i + 1) + ": " + article + separator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:  # keep headroom in the model's context window
            print(i)
            break
        content += addition
    prompt += content

    llm = ChatOpenAI(temperature=0.0)
    response = llm([
        SystemMessage(content=prompt),
        HumanMessage(content=question),
    ])
    print(response.content)
    print("response length: ", len(response.content))

    # Second pass: ask the model whether the response actually answered.
    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
    check = llm([
        SystemMessage(content=answer_found_prompt),
        HumanMessage(content=response.content),
    ])
    print(check.content)
    # BUG FIX: the original invoked the LLM twice (once for the print, once
    # for the comparison), paying for an extra request whose verdict could
    # differ from the printed one. strip() tolerates stray whitespace.
    if check.content.strip() == "0":
        return "I could not find the answer.", [], [], []

    # Parse "article N" citations out of the response (parentheses removed
    # so "(Article 2)" still matches).
    lowercase_split = re.sub(r"[()]", "", response.content.lower()).split()
    used_article_num = []
    # Iterate to len-1 so `i + 1` below can never overrun the list
    # (BUG FIX: original raised IndexError when "article" was the last word).
    for i in range(len(lowercase_split) - 1):
        if lowercase_split[i] == "article":
            next_word = "".join(c for c in lowercase_split[i + 1] if c.isdigit())
            print("Article number: ", next_word)
            # BUG FIX: skip words containing no digits — int("") raised
            # ValueError in the original. Also dedupe.
            if next_word and next_word not in used_article_num:
                used_article_num.append(next_word)

    print("Used article num: ", used_article_num)
    if not used_article_num:
        print("I could not find the answer. Reached")
        return "I could not find the answer.", [], [], []

    # Convert 1-based citation numbers to 0-based indices, dropping any
    # hallucinated number outside the range of supplied articles
    # (BUG FIX: original raised IndexError on out-of-range citations).
    indices = [int(num) - 1 for num in used_article_num]
    indices = [i for i in indices if 0 <= i < len(uuids)]

    links = [
        f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuids[i]}/"
        for i in indices
    ]
    cited_titles = [titles[i] for i in indices]
    cited_domains = [domains[i] for i in indices]

    # Remove the "(Article N)" citation from the visible answer. Non-greedy
    # `.*?` (BUG FIX) so only the citation is stripped, not everything up to
    # the last ")" in the response.
    response_without_source = re.sub(r"\(Article.*?\)", "", response.content)
    return response_without_source, links, cited_titles, cited_domains