File size: 4,551 Bytes
12cca3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from langchain_openai import ChatOpenAI

from langchain.schema import (
    HumanMessage,
    SystemMessage
)
import tiktoken
import re

from get_articles import save_solr_articles_full
from rerank import crossencoder_rerank_answer


def num_tokens_from_string(string: str, encoder) -> int:
    """Return how many tokens *encoder* produces when encoding *string*."""
    return len(encoder.encode(string))


def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with an LLM grounded in reranked articles, with citations.

    Args:
        information: iterable of (score, contents, uuid, title, domain) tuples,
            ordered best-first by the reranker.
        question: the user's question, passed to the model verbatim.

    Returns:
        Tuple of (response_with_citations, hyperlinks, titles, domains).
        If the model cites no articles, the raw response and three empty
        lists are returned instead.
    """
    prompt = """

    You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.



    When formulating your response, adhere to the following guidelines:

    

    1. Use information from the provided articles to directly answer the question. Explicitly reference the article(s) used in your response by stating the article number(s) (e.g., "According to Article 1, ..." or "Articles 2 and 3 mention that...").

    2. If the answer is not covered by any of the articles, clearly state that the information is unavailable. Do not guess or fabricate information.

    3. Avoid using ambiguous time references like 'recently' or 'last year.' Instead, use absolute terms based on the article's content (e.g., 'In 2021' or 'As per Article 2, published in 2020').

    4. Keep responses concise, accurate, and helpful while maintaining a professional tone.



    Below is a list of articles you can reference. Each article is identified by its number and content:

    """
    end_prompt = "\n----------------\n"
    prompt += end_prompt

    separator = "<<<<>>>>"

    # Track prompt size so the article context stays under the budget below.
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count = num_tokens_from_string(prompt, encoder)

    # Unpack the reranker tuples in a single pass (previously four passes).
    articles, uuids, titles_list, domains_list = [], [], [], []
    for _score, contents, uuid, title, domain in information:
        articles.append(contents)
        uuids.append(uuid)
        titles_list.append(title)
        domains_list.append(domain)

    # Append articles best-first until the token budget is exhausted.
    content = ""
    for i, article in enumerate(articles):
        addition = f"Article {i + 1}: {article} {separator}"
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            break
        content += addition

    prompt += content
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm.invoke(message)
    response_content = response.content  # Access the content of the AIMessage
    print("LLM Response Content:", response_content)

    # The prompt asks the model to cite as "According to Article 1, ..."; the
    # previous pattern r'\((Article \d+)\)' only matched the parenthesized
    # form and missed those citations. Match any "Article N" mention, and
    # drop numbers that don't map to an article we actually have (the model
    # may hallucinate an out-of-range number, which previously raised
    # IndexError).
    cited = re.findall(r'Article\s+(\d+)', response_content)
    valid_nums = sorted({int(n) for n in cited if 1 <= int(n) <= len(articles)})
    if not valid_nums:
        print("No sources found in the response.")
        return response_content, [], [], []

    # Sorted de-duplication keeps citation numbering deterministic
    # (list(set(...)) previously made the ordering vary between runs).
    citation_index = {num: idx for idx, num in enumerate(valid_nums, start=1)}

    # Create citation list ("1. Title (domain)"), article numbers are 1-based.
    citations = [
        f"{idx}. {titles_list[num - 1]} ({domains_list[num - 1]})"
        for num, idx in citation_index.items()
    ]

    # Replace "(Article N)" / "Article N" mentions with "[i]". Matching the
    # full number in one regex pass avoids the old substring bug where
    # replacing "Article 1" also corrupted "Article 10" into "[1]0".
    def _to_marker(match):
        num = int(match.group(1))
        return f"[{citation_index[num]}]" if num in citation_index else match.group(0)

    response_content = re.sub(r'\(?Article\s+(\d+)\)?', _to_marker, response_content)

    # Append citations to the response
    response_with_citations = f"{response_content}\n\nReferences:\n" + "\n".join(citations)

    # Prepare links with titles and domains
    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
    hyperlinks = [
        f"<a href='{link}' target='_blank'>{titles_list[i]}</a> ({domains_list[i]})"
        for i, link in enumerate(links)
    ]

    return response_with_citations, hyperlinks, titles_list, domains_list


if __name__ == "__main__":
    # Smoke-test the full retrieve -> rerank -> answer pipeline end to end.
    question = "How is United States fighting against tobacco addiction?"
    rerank_type = "crossencoder"  # NOTE(review): currently unused here
    llm_type = "chat"             # NOTE(review): currently unused here
    articles_csv = save_solr_articles_full(question, keyword_type="rake")
    reranked_articles = crossencoder_rerank_answer(articles_csv, question)
    feed_articles_to_gpt_with_links(reranked_articles, question)