vtiyyal1 commited on
Commit
4c7c1f7
1 Parent(s): 08f7e4c

Upload 2 files

Browse files

Previously uploaded the wrong files; this commit corrects the logging and publication-date handling.

Files changed (2) hide show
  1. app.py +48 -33
  2. feed_to_llm_v2.py +138 -85
app.py CHANGED
@@ -1,33 +1,48 @@
1
-
2
- import openai
3
- import gradio as gr
4
- from full_chain import get_response
5
- import os
6
-
7
- api_key = os.getenv("OPENAI_API_KEY")
8
- client = openai.OpenAI(api_key=api_key)
9
-
10
-
11
- def create_hyperlink(url, title, domain):
12
- return f"<a href='{url}'>{title}</a>" + " (" + domain + ")"
13
-
14
-
15
- def predict(message, history):
16
- print("get_responses: ")
17
- # print(get_response(message, rerank_type="crossencoder"))
18
- responder, links, titles, domains = get_response(message, rerank_type="crossencoder")
19
- for i in range(len(links)):
20
- links[i] = create_hyperlink(links[i], titles[i], domains[i])
21
-
22
- out = responder + "\n" + "\n".join(links)
23
-
24
- return out
25
-
26
-
27
- gr.ChatInterface(predict,
28
- examples = [
29
- "How many Americans Smoke?",
30
- "What are some measures taken by the Indian Government to reduce the smoking population?",
31
- "Does smoking negatively affect my health?"
32
- ]
33
- ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import gradio as gr
3
+ from full_chain import get_response
4
+ import os
5
+
6
api_key = os.getenv("OPENAI_API_KEY")  # may be None if the env var is unset
# NOTE(review): `client` is not referenced anywhere in this file — the LLM
# calls happen inside full_chain.get_response; confirm before removing.
client = openai.OpenAI(api_key=api_key)
8
+
9
def create_hyperlink(url, title, domain):
    """Render *title* as an HTML link to *url*, annotated with its source domain."""
    anchor = f"<a href='{url}' target='_blank'>{title}</a>"
    return f"{anchor} ({domain})"
12
+
13
def predict(message, history):
    """Answer a user message via the RAG chain and append hyperlinked sources.

    Args:
        message: the user's question.
        history: chat history supplied by Gradio (unused here).

    Returns:
        The model's answer, followed by a numbered, hyperlinked reference
        list when any sources were cited.
    """
    # Get the response plus per-source metadata.
    responder, links, titles, domains, published_dates = get_response(
        message, rerank_type="crossencoder"
    )

    # The responder already carries numbered citations inline; rebuild the
    # reference list with clickable links.
    hyperlinks = [
        f"[{i}] {create_hyperlink(link, title, domain)} {published_date}"
        for i, (link, title, domain, published_date) in enumerate(
            zip(links, titles, domains, published_dates), 1
        )
    ]

    # Strip any plain-text "References:" section the responder already has.
    main_response = responder.split("References:")[0].strip()

    # BUG FIX: previously an empty "References:" header was emitted even
    # when no sources were cited.
    if not hyperlinks:
        return main_response

    return f"{main_response}\n\nReferences:\n" + "\n".join(hyperlinks)
37
+
38
# Build the Gradio chat UI, then launch it.
chat_ui = gr.ChatInterface(
    predict,
    examples=[
        "How many Americans Smoke?",
        "What are some measures taken by the Indian Government to reduce the smoking population?",
        "Does smoking negatively affect my health?"
    ],
    title="Tobacco Information Assistant",
    description="Ask questions about tobacco-related topics and get answers with reliable sources."
)
chat_ui.launch()
feed_to_llm_v2.py CHANGED
@@ -1,85 +1,138 @@
1
- from langchain_openai import OpenAI
2
-
3
- from langchain.schema import (
4
- HumanMessage,
5
- SystemMessage
6
- )
7
- import tiktoken
8
- import re
9
-
10
- from get_articles import save_solr_articles_full
11
- from rerank import crossencoder_rerank_answer
12
-
13
-
14
- def num_tokens_from_string(string: str, encoder) -> int:
15
- num_tokens = len(encoder.encode(string))
16
- return num_tokens
17
-
18
-
19
- def feed_articles_to_gpt_with_links(information, question):
20
- prompt = """
21
- You are a Question Answering machine specialized in providing information on tobacco-related queries. You have access to a curated list of articles that span various aspects of tobacco use, health effects, legislation, and quitting resources. When responding to questions, follow these guidelines:
22
-
23
- 1. Use information from the articles to formulate your answers. Indicate the article number you're referencing at the end of your response.
24
- 2. If the question's answer is not covered by your articles, clearly state that you do not know the answer. Do not attempt to infer or make up information.
25
- 3. Avoid using time-relative terms like 'last year,' 'recently,' etc., as the articles' publication dates and the current date may not align. Instead, use absolute terms (e.g., 'In 2022,' 'As of the article's 2020 publication,').
26
- 4. Aim for concise, informative responses that directly address the question asked.
27
-
28
- Remember, your goal is to provide accurate, helpful information on tobacco-related topics, aiding in education and informed decision-making.
29
- """
30
- end_prompt = "\n----------------\n"
31
- prompt += end_prompt
32
- content = ""
33
- seperator = "<<<<>>>>"
34
-
35
- token_count = 0
36
- encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
37
- token_count += num_tokens_from_string(prompt, encoder)
38
-
39
- articles = [contents for score, contents, uuids, titles, domains in information]
40
- uuids = [uuids for score, contents, uuids, titles, domains in information]
41
- domains = [domains for score, contents, uuids, titles, domains in information]
42
-
43
- for i in range(len(articles)):
44
- addition = "Article " + str(i + 1) + ": " + articles[i] + seperator
45
- addition += articles[i] + seperator
46
- token_count += num_tokens_from_string(addition, encoder)
47
- if token_count > 3500:
48
- print(i)
49
- break
50
-
51
- content += addition
52
-
53
- prompt += content
54
- llm = OpenAI(model_name="gpt-4o-mini", temperature=0.0)
55
- message = [
56
- SystemMessage(content=prompt),
57
- HumanMessage(content=question)
58
- ]
59
-
60
- response = llm.invoke(message)
61
- print(response)
62
- print("response length:", len(response))
63
- source = re.findall('\((.*?)\)', response)[-1]
64
-
65
- # get integers from source
66
- source = re.findall(r'\d+', source)
67
- used_article_num = [int(i) - 1 for i in source]
68
-
69
- links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
70
- titles = [titles for score, contents, uuids, titles, domains in information]
71
-
72
- links = [links[i] for i in used_article_num]
73
- titles = [titles[i] for i in used_article_num]
74
- domains = [domains[i] for i in used_article_num]
75
-
76
- response_without_source = re.sub("""\(Article.*\)""", "", response)
77
- return response_without_source, links, titles, domains
78
-
79
- if __name__ == "__main__":
80
- question = "How is United States fighting against tobacco addiction?"
81
- rerank_type = "crossencoder"
82
- llm_type = "chat"
83
- csv_path = save_solr_articles_full(question, keyword_type="rake")
84
- reranked_out = crossencoder_rerank_answer(csv_path, question)
85
- feed_articles_to_gpt_with_links(reranked_out, question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+
3
+ from langchain.schema import (
4
+ HumanMessage,
5
+ SystemMessage
6
+ )
7
+ import tiktoken
8
+ import re
9
+
10
+ from get_articles import save_solr_articles_full
11
+ from rerank import crossencoder_rerank_answer
12
+ import logging
13
+ from logging.handlers import RotatingFileHandler
14
+
15
# Module-level logger writing to a rotating file: rolls over at 10 MB and
# keeps three backups.
logger = logging.getLogger("TobaccoInfoAssistant")
logger.setLevel(logging.INFO)

file_handler = RotatingFileHandler(
    "tobacco_info_assistant.log", maxBytes=10 * 1024 * 1024, backupCount=3
)
file_handler.setFormatter(
    logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
)
logger.addHandler(file_handler)
25
+
26
def num_tokens_from_string(string: str, encoder) -> int:
    """Count the tokens *encoder* produces for *string*."""
    return len(encoder.encode(string))
29
+
30
+
31
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with the LLM, grounded in the retrieved articles.

    Args:
        information: iterable of (score, contents, uuid, title, domain,
            published_date) tuples, ordered by relevance (most relevant first).
        question: the user's question string.

    Returns:
        Tuple of (response_with_citations, links, titles, domains,
        published_dates). The four lists cover only the articles the LLM
        actually cited, in citation order; all are empty when the response
        cites no article.
    """
    prompt = """
    You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.

    When formulating your response, adhere to the following guidelines:

    1. Use information from the provided articles to directly answer the question. Explicitly reference the article(s) used in your response by stating the article number(s) (e.g., "According to Article 1, ..." or "Articles 2 and 3 mention that...").
    2. If the answer is not covered by any of the articles, clearly state that the information is unavailable. Do not guess or fabricate information.
    3. Avoid using ambiguous time references like 'recently' or 'last year.' Instead, use absolute terms based on the article's content (e.g., 'In 2021' or 'As per Article 2, published in 2020').
    4. Keep responses concise, accurate, and helpful while maintaining a professional tone.

    Below is a list of articles you can reference. Each article is identified by its number and content:
    """
    end_prompt = "\n----------------\n"
    prompt += end_prompt

    content = ""
    separator = "<<<<>>>>"
    token_count = 0

    # Track the prompt size so the article payload stays within budget.
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count += num_tokens_from_string(prompt, encoder)

    # Unpack the reranked rows once (instead of one comprehension per field).
    articles, uuids, titles_list, domains_list, published_dates = [], [], [], [], []
    for _score, contents, uuid, title, domain, published_date in information:
        articles.append(contents)
        uuids.append(uuid)
        titles_list.append(title)
        domains_list.append(domain)
        published_dates.append(published_date)

    logger.info(f"Article retrieved: {len(articles)}")
    logger.info(f"Article titles: {titles_list}")

    # Append articles in relevance order until the token budget is reached.
    for i in range(len(articles)):
        addition = f"Article {i + 1}: {articles[i]} {separator}"
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            break
        content += addition

    prompt += content
    logger.info(f"Prompt: {prompt}")
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm.invoke(message)
    response_content = response.content  # AIMessage -> plain string
    logger.info(f"LLM Response Content: {response_content}")

    # Collect every "Article N" reference (covers both inline mentions and
    # parenthetical "(Article N)" forms). Keep only numbers that map to an
    # article we actually supplied: the model can hallucinate out-of-range
    # citations, which previously raised IndexError.
    cited_numbers = re.findall(r'Article (\d+)', response_content)
    used_article_nums = sorted({
        int(num) - 1 for num in cited_numbers if 0 < int(num) <= len(articles)
    })

    if not used_article_nums:
        # BUG FIX: this early exit used to return four values while the
        # normal path returns five, breaking callers that unpack five.
        return response_content, [], [], [], []

    # Number citations in article order (iterating a set() here previously
    # made the numbering nondeterministic).
    citation_map = {}
    citations = []
    for idx, article_num in enumerate(used_article_nums, start=1):
        citation_map[f"Article {article_num + 1}"] = f"[{idx}]"
        publication_date = published_dates[article_num] or "Unknown Date"
        citations.append(
            f"[{idx}] {titles_list[article_num]} ({domains_list[article_num]}) {publication_date}"
        )

    # Rewrite references as citation numbers. Longer labels are replaced
    # first so "Article 1" cannot corrupt "Article 10".
    modified_response = response_content
    for original in sorted(citation_map, key=len, reverse=True):
        citation_num = citation_map[original]
        # Replace both parenthetical and inline references.
        modified_response = modified_response.replace(f"({original})", citation_num)
        modified_response = modified_response.replace(original, citation_num)

    # Final response with the numbered reference list appended.
    response_with_citations = (
        f"{modified_response}\n\n"
        f"References:\n"
        f"{chr(10).join(citations)}"
    )

    # Metadata for the cited articles only, in citation order.
    cited_links = []
    cited_titles = []
    cited_domains = []
    cited_published_dates = []
    for article_num in used_article_nums:
        uuid = uuids[article_num]
        cited_links.append(
            f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
        )
        cited_titles.append(titles_list[article_num])
        cited_domains.append(domains_list[article_num])
        cited_published_dates.append(published_dates[article_num])
    return response_with_citations, cited_links, cited_titles, cited_domains, cited_published_dates
131
+
132
if __name__ == "__main__":
    # Smoke-test the full pipeline end to end with a sample question.
    # (Removed unused locals `rerank_type` and `llm_type` — nothing read them.)
    question = "How is United States fighting against tobacco addiction?"
    csv_path = save_solr_articles_full(question, keyword_type="rake")
    reranked_out = crossencoder_rerank_answer(csv_path, question)
    feed_articles_to_gpt_with_links(reranked_out, question)