kshitijk committed on
Commit
ccbdf6f
1 Parent(s): e33ac70

Add scripts

Browse files
Files changed (3) hide show
  1. app.py +347 -0
  2. requirements.txt +62 -0
  3. scraper.py +178 -0
app.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from openai import OpenAI
3
+ from scraper import *
4
+ from dotenv import load_dotenv
5
+ import streamlit as st
6
+ import math
7
# Set to True to run the interactive terminal loop instead of the Streamlit page.
USE_CLI = False
# Upper bound on how many trailing chat messages are forwarded to the model as history.
USE_HISTORY_LEN = 20

# Pull OAI_KEY / BRAVE_KEY from a local .env file into the environment, if one exists.
load_dotenv()

# NOTE: `os` is not imported by this module directly; it arrives through
# `from scraper import *`.
openai.api_key = os.getenv("OAI_KEY")
brave_key = os.getenv("BRAVE_KEY")
# The v1 client does not read the module-level `openai.api_key`; without an
# explicit key it only looks at the OPENAI_API_KEY environment variable.
# Passing the key keeps OAI_KEY working, and a missing OAI_KEY (None) still
# falls back to OPENAI_API_KEY.
client = OpenAI(api_key=openai.api_key)

# In-process transcript of user/assistant turns.
chat_hist = []
19
+
20
def display_imgs(urls):
    """Render at most nine images in a three-column Streamlit grid.

    Images fill the columns left to right and wrap back to the first column
    after every third image.
    """
    columns = st.columns(3)
    for position, image_url in enumerate(urls[:9]):
        with columns[position % 3]:
            st.image(image_url)
29
+
30
+
31
def update_query(raw_query, context):
    """Build the final prompt: the query followed by the retrieved context.

    Args:
        raw_query: Query text placed at the top of the prompt.
        context: Iterable of context snippets. ``None`` is treated as "no
            context", a bare string as a single snippet, ``None`` entries are
            skipped and other entries are converted with ``str``.

    Returns:
        The query, a blank line, then ``Context: `` followed by the snippets
        joined with newlines.
    """
    if context is None:
        snippets = []
    elif isinstance(context, str):
        # Joining a plain string would interleave newlines between its characters.
        snippets = [context]
    else:
        snippets = [str(item) for item in context if item is not None]
    context_str = "\n".join(snippets)
    updated_query = f"""
{raw_query}

Context: {context_str}
"""

    return updated_query
40
+
41
# Chat model used for every completion request in this module.
_CHAT_MODEL = "gpt-3.5-turbo-0125"

# System prompts keyed by the `mode` argument of openAI_api_call.
_SYSTEM_PROMPTS = {
    "router": """You are a helpful assistant with access to the chat history, user query and following tools and their descriptions:

TOOL NAME: get_relevant_context
TOOL_DESCRIPTION: Given user query, present relevant text information about the query.

TOOL NAME: get_relevant_images
TOOL_DESCRIPTION: Given user query, present relevant image URLs about the query.

The use of tools is optional. If based on user query and chat history, you feel that no tools are required, answer saying no_tools. Otherwise,
mention the tool name(s). Note, including relevant images whenever possible is highly encouraged to enhance user experience. The answer has to be one or more from [get_relevant_context, get_relevant_images, no_tools]. Think through this step by step and come up with the answer.

Here are some examples:

User: What is the name of a Tom Cruise Movie?
Assistant: get_relevant_context

User: Suggest some books by Enid Blyton
Assistant: get_relevant_context, get_relevant_images

User: Suggest some movies by Steven Speilberg
Assistant: get_relevant_context, get_relevant_images

User: Suggest a comedy movie
Assistant: get_relevant_context, get_relevant_images

User: Suggest a book for a seven year old
Assistant: get_relevant_context, get_relevant_images

User: Can you show me a poster of the movie Space Jam?
Assistant: get_relevant_images

User: Tell me a joke
Assistant: no_tools

User: Who are you?
Assistant: no_tools

User: Can you give me a summary of the third one?
Assistant: get_relevant_context

User: Can you give me a photo of this person?
Assistant: get_relevant_images
""",
    "images": """
"Given a user query and chat history, use the chat history and user query to give key words such as title, names, etc. Consider incorporating terms, phrases, or topics discussed in the chat history that may provide additional context or refine the search. Ensure the query return keywords separated by commas. Avoid ambiguity or overly broad queries that may result in irrelevant images. If no relevant chat history is available, focus on refining the query based on the user's input alone. Think through this step by step and come up with the answer."
Example:
Chat History:
User: Suggest a book for a 5 year old
Assistant: A recommended book for a 5-year-old is "Where the Wild Things Are" by Maurice Sendak

User: Can you give a picture of the author?
Assistant: Maurice Sendak

Example:
Chat History:
User: Suggest a book for a 5 year old
Assistant: I recommend the book "The Very Hungry Caterpillar" by Eric Carle for a 5-year-old

User: Can you give a picture of the book?
Assistant: The Very Hungry Caterpillar
""",
    "text": """You are a knowledgeable assistant with access to user queries and chat history. Your task is to revise user queries using the user query and chat history for web search to retrieve relevant information. Below are examples of user queries and optimized responses:

Example 1:
User: "I'm in the mood for a thriller novel. Any recommendations?"
Assistant: "Best thriller novels of all time"

Example 2:
User: "Who directed the movie Inception?"
Assistant: "Director of Inception"

Example 3:
User: "Can you tell me about the cast of The Godfather?"
Assistant: "Cast of The Godfather"

Example 4:
User: "What genre does The Great Gatsby belong to?"
Assistant: "Genre of The Great Gatsby"

Example 5:
User: "Suggest a book for a 5 year old"
Assistant: "Recommended book for a 5 year old"

Please provide brief and concise responses by revising the user queries accordingly. Think through this step by step and come up with the answer."
""",
    "direct": """Your task is to provide a random fun fact about children's books or movies. Be concise with the response.
""",
}

# System prompt used for any mode not listed above (callers pass "").
_DEFAULT_SYSTEM_PROMPT = """You are a knowledgeable chat assistant specialized in answering questions related to books, movies, and related topics such as authors, genres, target age groups, summaries, titles, cast, directors, producers, and plot genres. Your responses should be based on the provided chat history and/or context.
Your task is to provide accurate and relevant information to users' queries within the scope of books and movies.Remember to provide accurate and contextually relevant responses based on the user's queries and the information available from previous interactions. Think through this step by step and come up with the answer."""


def openAI_api_call(mode, query, raw_query = None):
    """Send one chat-completion request and return the reply text.

    Args:
        mode: Selects the system prompt: "router", "images", "text" or
            "direct". Any other value (callers pass "") selects the general
            books-and-movies answering prompt.
        query: Text sent as the final user message.
        raw_query: Unused; kept so existing call signatures stay valid.

    Returns:
        The content of the first choice in the completion response.

    In every mode except "direct", stored chat messages from
    ``st.session_state.messages`` are placed between the system prompt and
    the user message.
    """
    print("="*50)
    print(f"Using mode {mode}")
    print("="*50)

    system_prompt = _SYSTEM_PROMPTS.get(mode, _DEFAULT_SYSTEM_PROMPT)
    curr_msgs = [{"role": "system", "content": system_prompt}]

    if mode != "direct":
        # The message list may not exist yet (e.g. the CLI loop never creates
        # it); treat that as an empty history instead of raising.
        history = st.session_state.messages if "messages" in st.session_state else []
        # Keep at most USE_HISTORY_LEN trailing messages and drop the final one.
        # NOTE(review): the Streamlit flow appends the current user turn before
        # most calls, so dropping the last entry presumably avoids sending it
        # twice; the router call runs before that append and therefore drops
        # the previous assistant turn instead - confirm this is intended.
        curr_msgs.extend(history[-USE_HISTORY_LEN:-1])

    curr_msgs.append({"role": "user", "content": query})

    if mode != "direct":
        print("~"*50)
        print(curr_msgs)
        print("~"*50)

    response = client.chat.completions.create(
        model=_CHAT_MODEL,
        messages=curr_msgs
    )

    return response.choices[0].message.content
176
+
177
+
178
+
179
def make_router_call(query: str):
    """Run the query through the "router" prompt and return the model's raw answer.

    The answer is also logged to stdout between separator lines.
    """
    separator = "="*50
    router_answer = openAI_api_call("router", query)
    for line in (separator, f"Router answer is: {router_answer}", separator):
        print(line)

    return router_answer
187
+
188
def make_context_call(query: str, is_chat=True):
    """Answer a query using web context fetched for it.

    Steps: rewrite the query with the "text" prompt, fetch context for the
    rewritten query, append that context to it, and ask the default answering
    prompt. Each intermediate value is logged to stdout.

    Args:
        query: The user's original question.
        is_chat: When true, the query and the answer are appended to ``chat_hist``.

    Returns:
        The model's answer text.
    """
    separator = "="*50

    print(separator)
    print("get_relevant_context")
    print(separator)

    opt_query = openAI_api_call("text", query)
    print(separator)
    print(f"opt_query {opt_query}")
    print(separator)

    context = fetch_context(opt_query)
    print(separator)
    print(f"context {context}")
    print(separator)

    updated_query = update_query(opt_query, context)
    print(separator)
    print(f"updated_query {updated_query}")
    print(separator)

    answer = openAI_api_call("", updated_query)

    if is_chat:
        chat_hist.append({"role": "user", "content": query})
        chat_hist.append({"role": "assistant", "content": answer})
    print("@"*50)
    print(f"Answer: {answer}")
    print("@"*50)

    return answer
218
+
219
+
220
def make_img_search_call(query, answer):
    """Find image URLs relevant to the query (and to the text answer, if any).

    The "images" prompt turns the query, plus the answer when one exists, into
    search keywords, which are then passed to ``fetch_images``.

    Args:
        query: The user's original question.
        answer: Text answer already produced for this turn, or a falsy value.

    Returns:
        Whatever ``fetch_images`` returns for the generated keywords.
    """
    separator = "="*50

    print(separator)
    print("get_relevant_images")
    print(separator)
    if answer:
        opt_query = openAI_api_call("images", query + ", " + answer)
    else:
        opt_query = openAI_api_call("images", query)
        # NOTE(review): assumed to belong to this branch - with no text answer,
        # an empty assistant placeholder is stored in the session history.
        # Confirm against the original indentation.
        st.session_state.messages.append({"role": "assistant", "content": ""})

    print(separator)
    print(f"opt_query: {opt_query}")
    print(separator)
    images_urls = fetch_images(opt_query)
    print("@"*50)
    print(f"Found images: {images_urls}")
    print("@"*50)

    return images_urls
240
+
241
def make_default_call(query):
    """Answer a query without tools.

    The query is rewritten with the "text" prompt, the rewritten query is sent
    to the default answering prompt, and the turn is recorded in ``chat_hist``.
    Returns the answer text.
    """
    separator = "="*50
    answer_separator = "@"*50

    print(separator)
    print(f"Answering from past")
    print(separator)

    opt_query = openAI_api_call("text", query)
    print(separator)
    print(f"opt_query: {opt_query}")
    print(separator)

    answer = openAI_api_call("", opt_query)
    chat_hist.extend([
        {"role": "user", "content": query},
        {"role": "assistant", "content": answer},
    ])
    print(answer_separator)
    print(f"Answer: {answer}")
    print(answer_separator)

    return answer
257
+
258
if USE_CLI:
    # Terminal loop: route each prompt, then run whichever tools the router named.
    while True:

        query = input("prompt: ")
        router_answer = make_router_call(query)
        wants_context = "get_relevant_context" in router_answer
        wants_images = "get_relevant_images" in router_answer
        # True when at least one tool handled the turn.
        skip = wants_context or wants_images
        answer = None

        if wants_context:
            answer = make_context_call(query)

        if wants_images:
            images_urls = make_img_search_call(query, answer)

        if not skip:
            answer = make_default_call(query)

        print("!"*50)
        print("ONE TURN FINISHED")
        print("!"*50)

else:
    # Fetch the banner fun fact only when the session does not have one yet.
    if "facts" not in st.session_state:
        st.session_state.facts = [openAI_api_call("direct", "Give one random fun fact about a childrens book or movie")]
    st.set_page_config(page_title="Project BookWorm: Your own Librarian!", layout="centered", initial_sidebar_state="auto", menu_items=None)
    st.title("Project BookWorm: Your own Librarian!")
    st.markdown(f"""> ###### _{st.session_state.facts[0]}_""")
    st.info("Use this app to get recommendations for books and movies")

    # Create the chat history on first run, then replay it on every rerun.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Handle a newly submitted question.
    if query := st.chat_input("What would you like to know today?"):
        with st.chat_message("user"):
            st.markdown(query)

        router_answer = make_router_call(query)
        wants_context = "get_relevant_context" in router_answer
        wants_images = "get_relevant_images" in router_answer
        # True when at least one tool handled the turn.
        skip = wants_context or wants_images
        answer = None
        images_urls = None

        # Each branch stores the user turn in the history before its model call.
        if wants_context:
            st.session_state.messages.append({"role": "user", "content": query})
            answer = make_context_call(query)
            st.session_state.messages.append({"role": "assistant", "content": answer})

        if wants_images:
            st.session_state.messages.append({"role": "user", "content": query})
            images_urls = make_img_search_call(query, answer)

        if not skip:
            st.session_state.messages.append({"role": "user", "content": query})
            answer = make_default_call(query)
            st.session_state.messages.append({"role": "assistant", "content": answer})

        print("!"*50)
        print("ONE TURN FINISHED")
        print("!"*50)

        # Show the text answer and/or the image grid for this turn.
        with st.chat_message("assistant"):
            if answer:
                st.markdown(answer)
            if images_urls:
                display_imgs(images_urls)
347
+
requirements.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.3.0
6
+ attrs==23.2.0
7
+ beautifulsoup4==4.12.3
8
+ blinker==1.7.0
9
+ cachetools==5.3.3
10
+ certifi==2024.2.2
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ distro==1.9.0
14
+ frozenlist==1.4.1
15
+ gitdb==4.0.11
16
+ GitPython==3.1.42
17
+ h11==0.14.0
18
+ httpcore==1.0.4
19
+ httpx==0.27.0
20
+ idna==3.6
21
+ Jinja2==3.1.3
22
+ jsonschema==4.21.1
23
+ jsonschema-specifications==2023.12.1
24
+ markdown-it-py==3.0.0
25
+ MarkupSafe==2.1.5
26
+ mdurl==0.1.2
27
+ multidict==6.0.5
28
+ numpy==1.26.4
29
+ openai==1.14.2
30
+ packaging==23.2
31
+ pandas==2.2.1
32
+ pillow==10.2.0
33
+ protobuf==4.25.3
34
+ pyarrow==15.0.2
35
+ pydantic==2.6.4
36
+ pydantic_core==2.16.3
37
+ pydeck==0.8.1b0
38
+ Pygments==2.17.2
39
+ python-dateutil==2.9.0.post0
40
+ python-dotenv==1.0.1
41
+ pytz==2024.1
42
+ referencing==0.34.0
43
+ requests==2.31.0
44
+ rich==13.7.1
45
+ rpds-py==0.18.0
46
+ setuptools==69.2.0
47
+ six==1.16.0
48
+ smmap==5.0.1
49
+ sniffio==1.3.1
50
+ soupsieve==2.5
51
+ streamlit==1.32.2
52
+ tenacity==8.2.3
53
+ toml==0.10.2
54
+ toolz==0.12.1
55
+ tornado==6.4
56
+ tqdm==4.66.2
57
+ typing_extensions==4.10.0
58
+ tzdata==2024.1
59
+ urllib3==2.2.1
60
+ watchdog==4.0.0
61
+ wheel==0.41.2
62
+ yarl==1.9.4
scraper.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import os
4
+ import asyncio
5
+ import aiohttp
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+ brave_key = os.getenv("BRAVE_KEY")
11
+ # print(f"Brave Key: {brave_key}")
12
+ import time
13
+ import json
14
+
15
+
16
+ MAX_SCRAPED_LEN = 1024
17
+
18
+
19
+
20
def fetch_urls(response):
    """Extract the result URLs from a web-search API response.

    Args:
        response: Object whose ``json()`` method returns the decoded search
            payload (for example a ``requests.Response``).

    Returns:
        The ``url`` of every entry under ``web`` -> ``results``, in order.
        An empty list is returned when that section is missing or empty, so a
        search with no web hits no longer raises ``KeyError``.
    """
    results_dict = response.json() or {}
    web_results = (results_dict.get('web') or {}).get('results') or []
    return [res['url'] for res in web_results if res.get('url')]
32
+
33
async def fetch_content(session, url):
    """Download one page and return its tag-stripped text.

    Returns ``None`` when the response status is not 200 or when any
    exception is raised while fetching; the exception is logged to stdout.
    """
    try:
        async with session.get(url) as reply:
            if reply.status != 200:
                return None
            raw_html = await reply.read()
            return await async_remove_tags(raw_html)
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
    return None
42
+
43
async def fetch_all(urls):
    """Fetch every URL concurrently over a single aiohttp session.

    Returns one entry per URL, in input order. Because the gather call uses
    ``return_exceptions=True``, an entry may be an exception object instead of
    a fetch result.
    """
    async with aiohttp.ClientSession() as session:
        pending = [fetch_content(session, target) for target in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
48
+
49
def fetch_context(query):
    """Search the web for *query* and return text snippets from the top results.

    The Brave web-search endpoint is asked for four results, each result page
    is downloaded concurrently, and the tag-stripped text of every page is
    truncated to ``MAX_SCRAPED_LEN`` characters.

    Args:
        query: Search phrase.

    Returns:
        A list of text snippets; empty when the search request fails, returns
        a non-200 status, or no page could be read.
    """
    url = "https://api.search.brave.com/res/v1/web/search"
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": brave_key
    }
    params = {
        "q": query,
        "count": 4
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to fetch real-time data: {e}")
        return []

    if response.status_code != 200:
        print("Failed to fetch real-time data. Status code:", response.status_code)
        return []

    urls = fetch_urls(response)
    # asyncio.run creates a fresh event loop and closes it afterwards, which
    # replaces the get_event_loop / new_event_loop fallback and its bare except.
    results = asyncio.run(fetch_all(urls))

    # fetch_all gathers with return_exceptions=True, so keep only real text.
    return [
        content[:MAX_SCRAPED_LEN]
        for content in results
        if isinstance(content, str) and content
    ]
88
+
89
+
90
async def async_remove_tags(html):
    """Awaitable counterpart of ``remove_tags``.

    Delegates to ``remove_tags`` so the tag-stripping logic lives in one
    place; the work itself is synchronous and runs inline.

    Args:
        html: HTML markup to strip.

    Returns:
        The same string ``remove_tags`` returns for *html*.
    """
    return remove_tags(html)
102
+
103
def remove_tags(html):
    """Return the text of an HTML document as one space-separated string.

    ``<style>`` and ``<script>`` elements are removed before the text is
    collected.
    """
    document = BeautifulSoup(html, "html.parser")

    # Drop stylesheet and script elements so their contents are not collected.
    for node in document(['style', 'script']):
        node.decompose()

    text_pieces = document.stripped_strings
    return ' '.join(text_pieces)
114
+
115
+
116
+
117
def fetch_images(query):
    """Search for images matching *query* and return reachable thumbnail URLs.

    The Brave image-search endpoint is tried up to three times. On the first
    successful response, each result's thumbnail URL is requested and kept
    only if it answers with status 200.

    Args:
        query: Comma-separated keywords; they are combined into a single
            ``a + b`` search phrase.

    Returns:
        A list of thumbnail URLs that responded with status 200; empty when
        every attempt failed.
    """
    search_url = "https://api.search.brave.com/res/v1/images/search"
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": brave_key
    }
    image_query = " + ".join(query.split(','))
    params = {
        "q": image_query,
        "count": 10
    }
    print(f"Image Query: {image_query}")

    url_list = []
    tries = 3
    for _ in range(tries):
        try:
            response = requests.get(search_url, headers=headers, params=params, timeout=10)
            if response.status_code != 200:
                print("Failed to fetch real-time data. Status code:", response.status_code)
                continue
            thumbnails = [res['thumbnail']['src'] for res in response.json()['results']]
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            print(f"Cant retrieve: {e}")
            continue

        # Distinct loop names keep the search endpoint and its response intact
        # (the earlier code rebound `url` and `response` here).
        for thumb_url in thumbnails:
            try:
                if requests.get(thumb_url, timeout=10).status_code == 200:
                    url_list.append(thumb_url)
            except requests.RequestException:
                print(f"Invalid url : {thumb_url}")
        break  # Got a result, exit

    return url_list
164
+
165
+
166
if __name__ == "__main__":
    # Manual smoke test: fetch context for a sample query and time the run.
    import time
    query = "Suggest 3 books by Enid Blyton"
    start_ts = time.time()
    total_content = fetch_context(query)

    divider = "="*100
    for c in total_content:
        print(divider)
        print(c)
        print(divider)

    end_ts = time.time()
    print(f"Time taken {end_ts - start_ts} seconds")