Spaces:

kayteekay
/

Bookworm-websearch

Sleeping

App Files Files Community

kshitijk committed on Mar 24

Commit

ccbdf6f

•

1 Parent(s): e33ac70

Add scripts

Browse files

Files changed (3) hide show

app.py +347 -0
requirements.txt +62 -0
scraper.py +178 -0

app.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import openai
+from openai import OpenAI
+from scraper import *
+from dotenv import load_dotenv
+import  streamlit as st
+import math
+# Runtime switches: USE_CLI selects the terminal loop instead of the Streamlit UI;
+# USE_HISTORY_LEN caps how many stored chat messages are replayed to the model.
+USE_CLI = False
+USE_HISTORY_LEN = 20
+load_dotenv()
+# Kept for any code path that goes through the module-level `openai` client.
+openai.api_key = os.getenv("OAI_KEY")
+brave_key = os.getenv("BRAVE_KEY")
+# An explicit `OpenAI()` instance does not read `openai.api_key`, so hand it OAI_KEY
+# directly. When OAI_KEY is unset this passes None and the client falls back to the
+# OPENAI_API_KEY environment variable, exactly as the bare `OpenAI()` call did.
+client = OpenAI(api_key=os.getenv("OAI_KEY"))
+# Transcript used by the make_*_call helpers (separate from st.session_state.messages).
+chat_hist = []
+def display_imgs(urls):
+    """Render at most nine images in a three-column Streamlit grid, filling row by row."""
+    columns = st.columns(3)
+    for position, image_url in enumerate(urls[:9]):
+        # Cycle through the columns in order: 0, 1, 2, 0, 1, 2, ...
+        with columns[position % 3]:
+            st.image(image_url)
+def update_query(raw_query, context):
+    """Combine a query with retrieved context snippets into one prompt string.
+
+    Args:
+        raw_query: Query text placed on the first content line.
+        context: Iterable of strings, joined with newlines after ``Context:``.
+
+    Returns:
+        The template text with both values substituted; the template's own
+        leading newline and indentation are part of the result.
+    """
+    joined_context = "\n".join(context)
+    return f"""
+                {raw_query}
+                Context: {joined_context}
+                """
+def _system_prompt(mode):
+    """Return the system prompt for a call mode; unrecognised modes get the answering prompt."""
+    if mode == "router":
+        return """You are a helpful assistant with access to the chat history, user query and following tools and their descriptions:
+                                        TOOL NAME: get_relevant_context
+                                        TOOL_DESCRIPTION: Given user query, present relevant text information about the query.
+                                        TOOL NAME: get_relevant_images
+                                        TOOL_DESCRIPTION: Given user query, present relevant image URLs about the query.
+                                        The use of tools is optional. If based on user query and chat history, you feel that no tools are required, answer saying no_tools. Otherwise,
+                                        mention the tool name(s). Note, including relevant images whenever possible is highly encouraged to enhance user experience. The answer has to be one or more from [get_relevant_context, get_relevant_images, no_tools]. Think through this step by step and come up with the answer.
+                                        Here are some examples:
+                                        User: What is the name of a Tom Cruise Movie?
+                                        Assistant: get_relevant_context
+                                        User: Suggest some books by Enid Blyton
+                                        Assistant: get_relevant_context, get_relevant_images
+                                        User: Suggest some movies by Steven Speilberg
+                                        Assistant: get_relevant_context, get_relevant_images
+                                        User: Suggest a comedy movie
+                                        Assistant: get_relevant_context, get_relevant_images
+                                        User: Suggest a book for a seven year old
+                                        Assistant: get_relevant_context, get_relevant_images
+                                        User: Can you show me a poster of the movie Space Jam?
+                                        Assistant: get_relevant_images
+                                        User: Tell me a joke
+                                        Assistant: no_tools
+                                        User: Who are you?
+                                        Assistant: no_tools
+                                        User: Can you give me a summary of the third one?
+                                        Assistant: get_relevant_context
+                                        User: Can you give me a photo of this person?
+                                        Assistant: get_relevant_images
+                                        """
+    if mode == "images":
+        return """
+                                        "Given a user query and chat history, use the chat history and user query to give key words such as title, names, etc. Consider incorporating terms, phrases, or topics discussed in the chat history that may provide additional context or refine the search. Ensure the query return keywords separated by commas. Avoid ambiguity or overly broad queries that may result in irrelevant images. If no relevant chat history is available, focus on refining the query based on the user's input alone. Think through this step by step and come up with the answer."
+                                        Example:
+                                            Chat History:
+                                                User: Suggest a book for a 5 year old
+                                                Assistant: A recommended book for a 5-year-old is "Where the Wild Things Are" by Maurice Sendak
+                                            User: Can you give a picture of the author?
+                                            Assistant: Maurice Sendak
+                                        Example:
+                                            Chat History:
+                                                User: Suggest a book for a 5 year old
+                                                Assistant: I recommend the book "The Very Hungry Caterpillar" by Eric Carle for a 5-year-old
+                                            User: Can you give a picture of the book?
+                                            Assistant: The Very Hungry Caterpillar
+                                        """
+    if mode == "text":
+        return """You are a knowledgeable assistant with access to user queries and chat history. Your task is to revise user queries using the user query and chat history for web search to retrieve relevant information.  Below are examples of user queries and optimized responses:
+            Example 1:
+            User: "I'm in the mood for a thriller novel. Any recommendations?"
+            Assistant: "Best thriller novels of all time"
+            Example 2:
+            User: "Who directed the movie Inception?"
+            Assistant: "Director of Inception"
+            Example 3:
+            User: "Can you tell me about the cast of The Godfather?"
+            Assistant: "Cast of The Godfather"
+            Example 4:
+            User: "What genre does The Great Gatsby belong to?"
+            Assistant: "Genre of The Great Gatsby"
+            Example 5:
+            User: "Suggest a book for a 5 year old"
+            Assistant: "Recommended book for a 5 year old"
+            Please provide brief and concise responses by revising the user queries accordingly. Think through this step by step and come up with the answer."
+            """
+    if mode == "direct":
+        return """Your task is to provide a random fun fact about children's books or movies. Be concise with the response.
+            """
+    return """You are a knowledgeable chat assistant specialized in answering questions related to books, movies, and related topics such as authors, genres, target age groups, summaries, titles, cast, directors, producers, and plot genres. Your responses should be based on the provided chat history and/or context.
+                        Your task is to provide accurate and relevant information to users' queries within the scope of books and movies.Remember to provide accurate and contextually relevant responses based on the user's queries and the information available from previous interactions. Think through this step by step and come up with the answer."""
+def openAI_api_call(mode, query, raw_query = None):
+    """Send one chat-completion request and return the reply text.
+
+    Args:
+        mode: Selects the system prompt: "router", "images", "text" or "direct";
+            any other value (the callers pass "") uses the answering prompt.
+        query: Text sent as the final user message.
+        raw_query: Unused; kept so existing call signatures stay valid.
+
+    Returns:
+        The content of the first choice in the completion response.
+    """
+    print("="*50)
+    print(f"Using mode {mode}")
+    print("="*50)
+    curr_msgs = [{"role": "system", "content": _system_prompt(mode)}]
+    # "direct" is a stand-alone request (it runs before the chat history exists),
+    # so it neither replays history nor dumps the message list.
+    include_history = mode != "direct"
+    if include_history and "messages" in st.session_state:
+        # Replay the most recent stored turns, leaving out the final stored message.
+        # A negative start beyond the list length clamps to 0, so no min() is needed.
+        curr_msgs.extend(st.session_state.messages[-USE_HISTORY_LEN:-1])
+    curr_msgs.append({"role": "user", "content": query})
+    if include_history:
+        print("~"*50)
+        print(curr_msgs)
+        print("~"*50)
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo-0125",
+        messages=curr_msgs,
+    )
+    return response.choices[0].message.content
+def make_router_call(query: str):
+    """Ask the router prompt which tool(s) the query needs and return its raw reply."""
+    divider = "=" * 50
+    decision = openAI_api_call("router", query)
+    print(divider)
+    print(f"Router answer is:  {decision}")
+    print(divider)
+    return decision
+def make_context_call(query: str, is_chat=True):
+    """Answer a query with the help of freshly fetched web context.
+
+    The query is first rewritten for search ("text" mode), context is fetched
+    for the rewritten query, and the rewritten query plus context is sent to
+    the answering prompt.
+
+    Args:
+        query: The user's original question.
+        is_chat: When true, the question and answer are appended to ``chat_hist``.
+
+    Returns:
+        The model's answer text.
+    """
+    divider = "=" * 50
+    print(divider)
+    print("get_relevant_context")
+    print(divider)
+    opt_query = openAI_api_call("text", query)
+    print(divider)
+    print(f"opt_query {opt_query}")
+    print(divider)
+    context = fetch_context(opt_query)
+    print(divider)
+    print(f"context {context}")
+    print(divider)
+    updated_query = update_query(opt_query, context)
+    print(divider)
+    print(f"updated_query {updated_query}")
+    print(divider)
+    answer = openAI_api_call("", updated_query)
+    if is_chat:
+        chat_hist.append({"role": "user", "content": query})
+        chat_hist.append({"role": "assistant", "content": answer})
+    print("@"*50)
+    print(f"Answer: {answer}")
+    print("@"*50)
+    return answer
+def make_img_search_call(query, answer):
+    """Find image URLs relevant to a query.
+
+    Args:
+        query: The user's original question.
+        answer: Text answer already produced for this turn, or a falsy value;
+            when present it is appended to the query before keyword extraction.
+
+    Returns:
+        The list of image URLs returned by ``fetch_images``.
+    """
+    divider = "=" * 50
+    print(divider)
+    print("get_relevant_images")
+    print(divider)
+    search_input = query + ", " + answer if answer else query
+    opt_query = openAI_api_call("images", search_input)
+    # NOTE(review): an empty assistant turn is stored here; presumably it keeps
+    # user/assistant turns alternating because image URLs are not saved - confirm.
+    st.session_state.messages.append({"role": "assistant", "content": ""})
+    print(divider)
+    print(f"opt_query: {opt_query}")
+    print(divider)
+    images_urls = fetch_images(opt_query)
+    print("@"*50)
+    print(f"Found images: {images_urls}")
+    print("@"*50)
+    return images_urls
+def make_default_call(query):
+    """Answer a query from chat history alone (no web search) and log the exchange.
+
+    The query is rewritten via the "text" prompt, answered via the default
+    prompt, and both the original query and the answer are appended to
+    ``chat_hist``.
+    """
+    divider = "=" * 50
+    answer_rule = "@" * 50
+    print(divider)
+    print("Answering from past")
+    print(divider)
+    rewritten = openAI_api_call("text", query)
+    print(divider)
+    print(f"opt_query: {rewritten}")
+    print(divider)
+    reply = openAI_api_call("", rewritten)
+    chat_hist.extend([
+        {"role": "user", "content": query},
+        {"role": "assistant", "content": reply},
+    ])
+    print(answer_rule)
+    print(f"Answer: {reply}")
+    print(answer_rule)
+    return reply
+if USE_CLI:
+    # Terminal loop: route each prompt, then run the selected tool(s).
+    # NOTE(review): the helpers read st.session_state.messages, which only the
+    # Streamlit branch below initialises - confirm before enabling USE_CLI.
+    while True:
+        query = input("prompt: ")
+        router_answer = make_router_call(query)
+        skip = False
+        answer = None
+        if "get_relevant_context" in router_answer:
+            answer = make_context_call(query)
+            skip = True
+        if "get_relevant_images" in router_answer:
+            images_urls = make_img_search_call(query, answer)
+            skip = True
+        if (not skip):
+            answer = make_default_call(query)
+        print("!"*50)
+        print("ONE TURN FINISHED")
+        print("!"*50)
+else:
+    # Page configuration goes first, as Streamlit expects it before other commands.
+    st.set_page_config(page_title="Project BookWorm: Your own Librarian!", layout="centered", initial_sidebar_state="auto", menu_items=None)
+    # Fetch the banner fun fact once per session rather than on every rerun.
+    if "facts" not in st.session_state:
+        st.session_state.facts = [openAI_api_call("direct", "Give one random fun fact about a childrens book or movie")]
+    st.title("Project BookWorm: Your own Librarian!")
+    st.markdown(f"""> ###### _{st.session_state.facts[0]}_""")
+    st.info("Use this app to get recommendations for books and movies")
+    # Initialize chat history
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    # Display chat messages from history on app rerun
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Accept user input
+    if query := st.chat_input("What would you like to know today?"):
+        # Display user message in chat message container
+        with st.chat_message("user"):
+            st.markdown(query)
+        router_answer = make_router_call(query)
+        wants_context = "get_relevant_context" in router_answer
+        wants_images = "get_relevant_images" in router_answer
+        answer = None
+        images_urls = None
+        # Store the user turn exactly once, whichever tools were selected, so it
+        # is not rendered twice on the next rerun.
+        st.session_state.messages.append({"role": "user", "content": query})
+        if wants_context:
+            answer = make_context_call(query)
+            st.session_state.messages.append({"role": "assistant", "content": answer})
+        if wants_images:
+            images_urls = make_img_search_call(query, answer)
+        if not (wants_context or wants_images):
+            answer = make_default_call(query)
+            st.session_state.messages.append({"role": "assistant", "content": answer})
+        print("!"*50)
+        print("ONE TURN FINISHED")
+        print("!"*50)
+        # Display assistant response in chat message container
+        with st.chat_message("assistant"):
+            if answer:
+                st.markdown(answer)
+            if images_urls:
+                display_imgs(images_urls)

requirements.txt ADDED Viewed

	@@ -0,0 +1,62 @@

+aiohttp==3.9.3
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+beautifulsoup4==4.12.3
+blinker==1.7.0
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+click==8.1.7
+distro==1.9.0
+frozenlist==1.4.1
+gitdb==4.0.11
+GitPython==3.1.42
+h11==0.14.0
+httpcore==1.0.4
+httpx==0.27.0
+idna==3.6
+Jinja2==3.1.3
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+multidict==6.0.5
+numpy==1.26.4
+openai==1.14.2
+packaging==23.2
+pandas==2.2.1
+pillow==10.2.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pydantic==2.6.4
+pydantic_core==2.16.3
+pydeck==0.8.1b0
+Pygments==2.17.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+referencing==0.34.0
+requests==2.31.0
+rich==13.7.1
+rpds-py==0.18.0
+setuptools==69.2.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.5
+streamlit==1.32.2
+tenacity==8.2.3
+toml==0.10.2
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.2
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+watchdog==4.0.0
+wheel==0.41.2
+yarl==1.9.4

scraper.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import requests
+from bs4 import BeautifulSoup
+import os
+import asyncio
+import aiohttp
+from dotenv import load_dotenv
+# Load settings from a local .env file (python-dotenv) before reading them below.
+load_dotenv()
+# Token sent to the Brave Search API in the X-Subscription-Token header.
+brave_key = os.getenv("BRAVE_KEY")
+# print(f"Brave Key: {brave_key}")
+import time
+import json
+# Maximum number of characters kept from each scraped page.
+MAX_SCRAPED_LEN = 1024
+def fetch_urls(response):
+    """Extract the result URLs from a web-search API response.
+
+    Args:
+        response: Object whose ``json()`` returns the decoded search payload,
+            expected to hold hits under ``payload["web"]["results"]``.
+
+    Returns:
+        The ``url`` value of every hit, in payload order. A payload without a
+        ``web``/``results`` section yields an empty list instead of a KeyError,
+        and hits lacking a ``url`` are skipped.
+    """
+    payload = response.json()
+    hits = (payload.get("web") or {}).get("results") or []
+    return [hit["url"] for hit in hits if "url" in hit]
+async def fetch_content(session, url):
+    """Download one page and return its tag-stripped text.
+
+    Returns None when the response status is not 200 or when any exception is
+    raised while fetching or parsing (the error is printed, not re-raised).
+    """
+    try:
+        async with session.get(url) as reply:
+            if reply.status != 200:
+                return None
+            raw_html = await reply.read()
+            return await async_remove_tags(raw_html)
+    except Exception as e:
+        print(f"Error fetching content from {url}: {e}")
+        return None
+async def fetch_all(urls):
+    """Fetch every URL concurrently over one shared aiohttp session.
+
+    Returns the gathered results in the same order as ``urls``; because
+    ``return_exceptions=True`` is used, a failed task contributes its
+    exception object instead of raising.
+    """
+    async with aiohttp.ClientSession() as session:
+        pending = [fetch_content(session, link) for link in urls]
+        return await asyncio.gather(*pending, return_exceptions=True)
+def fetch_context(query):
+    """Search the web for a query and return text snippets from the top hits.
+
+    Requests four results from the Brave web-search endpoint, downloads the
+    result pages concurrently, and keeps the first ``MAX_SCRAPED_LEN``
+    characters of each page's text.
+
+    Args:
+        query: Search phrase.
+
+    Returns:
+        A list of text snippets; empty when the search request fails, returns
+        a non-200 status, or no page could be scraped.
+    """
+    endpoint = "https://api.search.brave.com/res/v1/web/search"
+    headers = {
+        "Accept": "application/json",
+        "Accept-Encoding": "gzip",
+        "X-Subscription-Token": brave_key,
+    }
+    params = {
+        "q": query,
+        "count": 4,
+    }
+    try:
+        # A timeout keeps a stalled connection from hanging the caller forever.
+        response = requests.get(endpoint, headers=headers, params=params, timeout=10)
+    except requests.RequestException as e:
+        print(f"Failed to fetch real-time data: {e}")
+        return []
+    if response.status_code != 200:
+        print("Failed to fetch real-time data. Status code:", response.status_code)
+        return []
+    urls = fetch_urls(response)
+    # asyncio.run creates a fresh event loop and closes it afterwards, replacing
+    # the manual get/new/set loop handling and its bare except.
+    results = asyncio.run(fetch_all(urls))
+    # gather(..., return_exceptions=True) can yield exception objects; keep text only.
+    return [
+        page[:MAX_SCRAPED_LEN]
+        for page in results
+        if isinstance(page, str) and page
+    ]
+# Function to remove tags
+async def async_remove_tags(html):
+    """Awaitable form of ``remove_tags``.
+
+    Delegates to ``remove_tags`` so the stripping logic lives in one place.
+    The parsing itself still runs synchronously inside the coroutine.
+    """
+    return remove_tags(html)
+def remove_tags(html):
+    """Return the text of an HTML document with <style> and <script> content removed.
+
+    The remaining text fragments are stripped and joined with single spaces.
+    """
+    document = BeautifulSoup(html, "html.parser")
+    unwanted = document(['style', 'script'])
+    for node in unwanted:
+        # Drop the element together with everything inside it.
+        node.decompose()
+    fragments = document.stripped_strings
+    return ' '.join(fragments)
+def _image_is_reachable(image_url):
+    """Return True when a GET on image_url answers 200; print and return False on request errors."""
+    try:
+        return requests.get(image_url, timeout=10).status_code == 200
+    except requests.RequestException:
+        print(f"Invalid url : {image_url}")
+        return False
+def fetch_images(query):
+    """Search Brave Images and return the thumbnail URLs that can be fetched.
+
+    Args:
+        query: Comma-separated keywords; they are joined with " + " into one
+            search phrase.
+
+    Returns:
+        Thumbnail URLs (up to the 10 requested) that answered HTTP 200. The
+        search itself is attempted up to three times; an empty list is
+        returned when every attempt fails.
+    """
+    endpoint = "https://api.search.brave.com/res/v1/images/search"
+    headers = {
+        "Accept": "application/json",
+        "Accept-Encoding": "gzip",
+        "X-Subscription-Token": brave_key,
+    }
+    search_phrase = " + ".join(query.split(','))
+    params = {
+        "q": search_phrase,
+        "count": 10,
+    }
+    print(f"Image Query: {search_phrase}")
+    url_list = []
+    tries = 3
+    for _ in range(tries):
+        try:
+            # The request is inside the try so a network error triggers a retry
+            # instead of escaping the loop.
+            response = requests.get(endpoint, headers=headers, params=params, timeout=10)
+            if response.status_code != 200:
+                print("Failed to fetch real-time data. Status code:", response.status_code)
+                continue
+            results = response.json().get("results") or []
+        except (requests.RequestException, ValueError) as e:
+            print(f"Cant retrieve: {e}")
+            continue
+        for item in results:
+            # Skip hits without a thumbnail rather than failing the whole attempt.
+            thumbnail_url = (item.get("thumbnail") or {}).get("src")
+            if thumbnail_url and _image_is_reachable(thumbnail_url):
+                url_list.append(thumbnail_url)
+        break # Got a result, exit
+    return url_list
+if __name__ == "__main__":
+    # Manual smoke test: fetch context for a sample query, print it, and time the run.
+    import time
+    query = "Suggest 3 books by Enid Blyton"
+    banner = "=" * 100
+    start_ts = time.time()
+    for snippet in fetch_context(query):
+        print(banner)
+        print(snippet)
+        print(banner)
+    elapsed = time.time() - start_ts
+    print(f"Time taken {elapsed} seconds")