Upload 2 files
Previously uploaded the wrong files; these are the correct versions for the logging and publication-date changes.
- app.py +48 -33
- feed_to_llm_v2.py +138 -85
app.py
CHANGED
@@ -1,33 +1,48 @@
import openai
import gradio as gr
from full_chain import get_response
import os

api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)


def create_hyperlink(url, title, domain):
    """Create an HTML hyperlink with domain information."""
    return f"<a href='{url}' target='_blank'>{title}</a> ({domain})"


def predict(message, history):
    """Process a user message and return a response with hyperlinked sources."""
    # Get the response and its source information
    responder, links, titles, domains, published_dates = get_response(
        message, rerank_type="crossencoder"
    )

    # The responder already contains the formatted response with numbered
    # citations; we only need to append the hyperlinked references.
    hyperlinks = []
    for i, (link, title, domain, published_date) in enumerate(
        zip(links, titles, domains, published_dates), 1
    ):
        hyperlink = f"[{i}] {create_hyperlink(link, title, domain)} {published_date}"
        hyperlinks.append(hyperlink)

    # Split the responder to separate the answer from its plain-text references
    response_parts = responder.split("References:")
    main_response = response_parts[0].strip()

    # Combine the response with the hyperlinked references
    # (chr(10) == "\n"; f-string expressions disallow backslashes before Python 3.12)
    final_response = (
        f"{main_response}\n\n"
        f"References:\n"
        f"{chr(10).join(hyperlinks)}"
    )

    return final_response


# Initialize and launch the Gradio interface
gr.ChatInterface(
    predict,
    examples=[
        "How many Americans smoke?",
        "What are some measures taken by the Indian Government to reduce the smoking population?",
        "Does smoking negatively affect my health?"
    ],
    title="Tobacco Information Assistant",
    description="Ask questions about tobacco-related topics and get answers with reliable sources."
).launch()

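app.py unpacks a five-element tuple from full_chain.get_response, which is not part of this commit. For local testing without the retrieval stack, a minimal stand-in is sketched below; the signature and return shape mirror how predict() uses it, and everything else (the body and the sample values) is hypothetical:

# Hypothetical stand-in for full_chain.get_response. Only the 5-tuple
# return shape is taken from app.py; all values below are invented.
def get_response(message, rerank_type="crossencoder"):
    responder = (
        "Smoking remains a leading cause of preventable death [1].\n\n"
        "References:\n"
        "[1] Example Title"
    )
    links = ["https://example.org/articles/1/"]
    titles = ["Example Title"]
    domains = ["example.org"]
    published_dates = ["2021-01-01"]
    return responder, links, titles, domains, published_dates
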
feed_to_llm_v2.py
CHANGED
@@ -1,85 +1,138 @@
from langchain_openai import ChatOpenAI

from langchain.schema import (
    HumanMessage,
    SystemMessage
)
import tiktoken
import re

from get_articles import save_solr_articles_full
from rerank import crossencoder_rerank_answer
import logging
from logging.handlers import RotatingFileHandler

# Configure logging: rotate at 10 MB, keep three backups
logger = logging.getLogger("TobaccoInfoAssistant")
logger.setLevel(logging.INFO)

handler = RotatingFileHandler(
    "tobacco_info_assistant.log", maxBytes=10 * 1024 * 1024, backupCount=3
)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def num_tokens_from_string(string: str, encoder) -> int:
    num_tokens = len(encoder.encode(string))
    return num_tokens


def feed_articles_to_gpt_with_links(information, question):
    prompt = """
You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.

When formulating your response, adhere to the following guidelines:

1. Use information from the provided articles to directly answer the question. Explicitly reference the article(s) used in your response by stating the article number(s) (e.g., "According to Article 1, ..." or "Articles 2 and 3 mention that...").
2. If the answer is not covered by any of the articles, clearly state that the information is unavailable. Do not guess or fabricate information.
3. Avoid ambiguous time references like "recently" or "last year." Instead, use absolute terms based on the article's content (e.g., "In 2021" or "As per Article 2, published in 2020").
4. Keep responses concise, accurate, and helpful while maintaining a professional tone.

Below is a list of articles you can reference. Each article is identified by its number and content:
"""
    end_prompt = "\n----------------\n"
    prompt += end_prompt

    content = ""
    separator = "<<<<>>>>"
    token_count = 0

    # Encoder setup for token-count tracking
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count += num_tokens_from_string(prompt, encoder)

    # `information` is an iterable of
    # (score, contents, uuid, title, domain, published_date) tuples
    articles = [contents for score, contents, uuids, titles, domains, published_dates in information]
    uuids = [uuids for score, contents, uuids, titles, domains, published_dates in information]
    titles_list = [titles for score, contents, uuids, titles, domains, published_dates in information]
    domains_list = [domains for score, contents, uuids, titles, domains, published_dates in information]
    published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
    logger.info(f"Articles retrieved: {len(articles)}")
    logger.info(f"Article titles: {titles_list}")

    # Append articles to the prompt until the 3,500-token budget is exhausted
    for i in range(len(articles)):
        addition = f"Article {i + 1}: {articles[i]} {separator}"
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            break
        content += addition

    prompt += content
    logger.info(f"Prompt: {prompt}")
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm.invoke(message)
    response_content = response.content  # Access the content of the AIMessage
    logger.info(f"LLM Response Content: {response_content}")

    # Extract article references from the response content
    inline_matches = re.findall(r'Article \d+', response_content)
    parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)

    if not (inline_matches or parenthetical_matches):
        # Match the five-value signature of the normal return path
        return response_content, [], [], [], []

    # Combine and deduplicate references, preserving first-appearance order
    # so citation numbers are deterministic across runs
    all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
    unique_articles = list(dict.fromkeys(all_matches))
    used_article_nums = [int(re.findall(r'\d+', match)[0]) - 1 for match in unique_articles]
    # Guard against the model citing an article number it was never given
    used_article_nums = [n for n in used_article_nums if 0 <= n < len(articles)]

    # Create citation mapping
    citation_map = {}
    citations = []
    for idx, article_num in enumerate(used_article_nums, start=1):
        original = f"Article {article_num + 1}"
        citation_map[original] = f"[{idx}]"
        publication_date = published_dates[article_num] if published_dates[article_num] else "Unknown Date"
        citation = f"[{idx}] {titles_list[article_num]} ({domains_list[article_num]}) {publication_date}"
        citations.append(citation)

    # Replace all article references with citation numbers; replace longer
    # keys first so "Article 10" is not clobbered by "Article 1"
    modified_response = response_content
    for original, citation_num in sorted(citation_map.items(), key=lambda kv: -len(kv[0])):
        # Replace both parenthetical and inline references
        modified_response = modified_response.replace(f"({original})", citation_num)
        modified_response = modified_response.replace(original, citation_num)

    # Format the final response with citations
    response_with_citations = (
        f"{modified_response}\n\n"
        f"References:\n"
        f"{chr(10).join(citations)}"
    )

    # Prepare links only for cited articles
    cited_links = []
    cited_titles = []
    cited_domains = []
    cited_published_dates = []
    for article_num in used_article_nums:
        uuid = uuids[article_num]
        link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
        cited_links.append(link)
        cited_titles.append(titles_list[article_num])
        cited_domains.append(domains_list[article_num])
        cited_published_dates.append(published_dates[article_num])
    return response_with_citations, cited_links, cited_titles, cited_domains, cited_published_dates


if __name__ == "__main__":
    question = "How is the United States fighting against tobacco addiction?"
    rerank_type = "crossencoder"
    llm_type = "chat"
    csv_path = save_solr_articles_full(question, keyword_type="rake")
    reranked_out = crossencoder_rerank_answer(csv_path, question)
    feed_articles_to_gpt_with_links(reranked_out, question)
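
feed_articles_to_gpt_with_links expects its `information` argument in the shape produced by crossencoder_rerank_answer: an iterable of (score, contents, uuid, title, domain, published_date) tuples, inferred here from how the function unpacks it. A minimal sketch with invented values (an OPENAI_API_KEY is still required, since the function calls the live model):

# Dummy input matching the assumed 6-tuple shape; all values are invented.
sample_information = [
    (0.92,
     "In 2021 the FDA proposed product standards to ban menthol cigarettes.",
     "123e4567-e89b-12d3-a456-426614174000",
     "FDA Moves Against Menthol",
     "example.org",
     "2021-04-29"),
]
response, links, titles, domains, dates = feed_articles_to_gpt_with_links(
    sample_information, "What did the FDA propose in 2021?"
)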