SearchGPT

Paused

App Files Community

Shreyas094 committed on Jul 9, 2024

Commit

4d152e0

•

1 Parent(s): 8650279

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -188

app.py CHANGED Viewed

@@ -20,14 +20,13 @@ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from datetime import datetime
-from huggingface_hub.utils import HfHubHTTPError
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 # Memory database to store question-answer pairs
 memory_database = {}
 conversation_history = []
 def load_and_split_document_basic(file):
     """Loads and splits the document into pages."""
@@ -101,25 +100,15 @@ def get_model(temperature, top_p, repetition_penalty):
         huggingfacehub_api_token=huggingface_token
     )
-def generate_chunked_response(model, prompt, max_tokens=200):
     full_response = ""
-    total_length = len(prompt.split())  # Approximate token count of prompt
-    while total_length < 7800:  # Leave some margin
-        try:
-            chunk = model(prompt + full_response, max_new_tokens=min(200, 7800 - total_length))
-            chunk = chunk.strip()
-            if not chunk:
-                break
             full_response += chunk
-            total_length += len(chunk.split())  # Approximate token count
-            if chunk.endswith((".", "!", "?")):
-                break
-        except Exception as e:
-            print(f"Error generating response: {str(e)}")
             break
     return full_response.strip()
 def manage_conversation_history(question, answer, history, max_history=5):
@@ -197,10 +186,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
             print(f"Found {len(result_block)} results on this page")
             for result in result_block:
                 link = result.find("a", href=True)
-                title = result.find("h3")
-                if link and title:
                     link = link["href"]
-                    title = title.get_text()
                     print(f"Processing link: {link}")
                     try:
                         webpage = session.get(link, headers=headers, timeout=timeout)
@@ -208,21 +195,20 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                         visible_text = extract_text_from_webpage(webpage.text)
                         if len(visible_text) > max_chars_per_page:
                             visible_text = visible_text[:max_chars_per_page] + "..."
-                        all_results.append({"link": link, "title": title, "text": visible_text})
                         print(f"Successfully extracted text from {link}")
                     except requests.exceptions.RequestException as e:
                         print(f"Error retrieving webpage content: {e}")
-                        all_results.append({"link": link, "title": title, "text": None})
                 else:
-                    print("No link or title found for this result")
-                    all_results.append({"link": None, "title": None, "text": None})
             start += len(result_block)
     print(f"Search completed. Total results: {len(all_results)}")
     print("Search results:")
     for i, result in enumerate(all_results, 1):
         print(f"Result {i}:")
-        print(f"  Title: {result['title']}")
         print(f"  Link: {result['link']}")
         if result['text']:
             print(f"  Text: {result['text'][:100]}...")  # Print first 100 characters
@@ -232,61 +218,92 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
     if not all_results:
         print("No search results found. Returning a default message.")
-        return [{"link": None, "title": "No Results", "text": "No information found in the web search results."}]
     return all_results
-def summarize_content(content, model):
-    if content is None:
-        return "No content available to summarize."
-    summary_prompt = f"""
-    You are a financial analyst and given a task to summarize the following news article in concise and coherent brief paragraph. Focus on the key points, main events, significant details and any point that could have major implications. Ensure the summary is informative and relevant to current news:
-    {content[:3000]}  # Limit input to avoid token limits
     Summary:
     """
-    summary = generate_chunked_response(model, summary_prompt, max_tokens=300)  # Adjust max_tokens as needed
     return summary
-def rank_search_results(titles, summaries, model):
-    if not titles or not summaries:
-        print("No titles or summaries to rank.")
-        return list(range(1, len(titles) + 1))
-    ranking_prompt = (
-        "Rank the following search results from a financial analyst perspective. "
-        f"Assign a rank from 1 to {len(titles)} based on relevance, with 1 being the most relevant. "
-        "Return only the numeric ranks in order, separated by commas.\n\n"
-        "Titles and summaries:\n"
-    )
-    for i, (title, summary) in enumerate(zip(titles, summaries), 1):
-        ranking_prompt += f"{i}. Title: {title}\nSummary: {summary}\n\n"
-    ranking_prompt += "Ranks:"
-    try:
-        ranks_str = generate_chunked_response(model, ranking_prompt)
-        print(f"Model output for ranking: {ranks_str}")
-        if not ranks_str.strip():
-            print("Model returned an empty string for ranking.")
-            return list(range(1, len(titles) + 1))
-        ranks = [float(rank.strip()) for rank in ranks_str.split(',') if rank.strip()]
-        if len(ranks) != len(titles):
-            print(f"Warning: Number of ranks ({len(ranks)}) does not match number of titles ({len(titles)})")
-            return list(range(1, len(titles) + 1))
-        return ranks
-    except Exception as e:
-        print(f"Error in ranking: {str(e)}. Using fallback ranking method.")
-        return list(range(1, len(titles) + 1))
-def ask_question(question, temperature, top_p, repetition_penalty, web_search):
     global conversation_history
     if not question:
@@ -295,39 +312,24 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()
     if web_search:
         search_results = google_search(question)
-        processed_results = []
-        for index, result in enumerate(search_results, start=1):
-            if result["text"] is not None:
-                try:
-                    summary = summarize_content(result["text"], model)
-                    processed_results.append({
-                        "title": result.get("title", f"Result {index}"),
-                        "summary": summary,
-                        "index": index
-                    })
-                except Exception as e:
-                    print(f"Error processing search result {index}: {str(e)}")
-            else:
-                print(f"Skipping result {index} due to None content")
-        if not processed_results:
-            return "No valid search results found."
-        print(f"Number of processed results: {len(processed_results)}")
-        # For news requests, return the summaries directly
-        if "news" in question.lower():
-            news_response = "Here are the latest news summaries on this topic:\n\n"
-            for result in processed_results[:5]:  # Limit to top 5 results
-                news_response += f"Title: {result['title']}\n\nSummary: {result['summary']}\n\n---\n\n"
-            return news_response.strip()
-        # For other questions, use the summaries as context
-        context_str = "\n\n".join([f"Title: {r['title']}\nSummary: {r['summary']}"
-                                   for r in processed_results])
         prompt_template = """
         Answer the question based on the following web search results:
@@ -335,17 +337,31 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
         {context}
         Current Question: {question}
         If the web search results don't contain relevant information, state that the information is not available in the search results.
-        Provide a concise and direct answer to the question:
         """
         prompt_val = ChatPromptTemplate.from_template(prompt_template)
         formatted_prompt = prompt_val.format(context=context_str, question=question)
-        answer = generate_chunked_response(model, formatted_prompt)
-    else:
-        if not os.path.exists("faiss_database"):
-            return "No documents available. Please upload documents or enable web search to answer questions."
-        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
         history_str = "\n".join([f"Q: {item['question']}\nA: {item['answer']}" for item in conversation_history])
@@ -359,9 +375,26 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
         prompt_val = ChatPromptTemplate.from_template(prompt)
         formatted_prompt = prompt_val.format(history=history_str, context=context_str, question=question)
-        answer = generate_chunked_response(model, formatted_prompt)
-    if not web_search:
         memory_database[question] = answer
         conversation_history = manage_conversation_history(question, answer, conversation_history)
@@ -393,67 +426,6 @@ def update_vectors(files, use_recursive_splitter):
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
-def update_vector_db_with_search_results(search_results, ranks, current_date):
-    embed = get_embeddings()
-    documents = []
-    for result, rank in zip(search_results, ranks):
-        if result.get("summary"):
-            doc = Document(
-                page_content=result["summary"],
-                metadata={
-                    "search_date": current_date,
-                    "search_title": result.get("title", ""),
-                    "search_content": result.get("content", ""),
-                    "search_summary": result["summary"],
-                    "rank": rank
-                }
-            )
-            documents.append(doc)
-    if not documents:
-        print("No valid documents to add to the database.")
-        return
-    texts = [doc.page_content for doc in documents]
-    metadatas = [doc.metadata for doc in documents]
-    print(f"Number of documents to embed: {len(texts)}")
-    print(f"First document text: {texts[0][:100]}...")  # Print first 100 characters of the first document
-    try:
-        if os.path.exists("faiss_database"):
-            database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-            database.add_texts(texts, metadatas=metadatas)
-        else:
-            database = FAISS.from_texts(texts, embed, metadatas=metadatas)
-        database.save_local("faiss_database")
-        print("Database updated successfully.")
-    except Exception as e:
-        print(f"Error updating database: {str(e)}")
-def export_vector_db_to_excel():
-    embed = get_embeddings()
-    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
-    documents = database.docstore._dict.values()
-    data = [{
-        "Search Date": doc.metadata["search_date"],
-        "Search Title": doc.metadata["search_title"],
-        "Search Content": doc.metadata["search_content"],
-        "Search Summary": doc.metadata["search_summary"],
-        "Rank": doc.metadata["rank"]
-    } for doc in documents]
-    df = pd.DataFrame(data)
-    with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
-        excel_path = tmp.name
-        df.to_excel(excel_path, index=False)
-    return excel_path
 def extract_db_to_excel():
     embed = get_embeddings()
     database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
@@ -485,7 +457,7 @@ def export_memory_db_to_excel():
 # Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Chat with your PDF documents")
     with gr.Row():
         file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
@@ -498,34 +470,30 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(label="Conversation")
-            question_input = gr.Textbox(label="Ask a question about your documents")
             submit_button = gr.Button("Submit")
         with gr.Column(scale=1):
             temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
             top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
             repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
             web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
-    def chat(question, history, temperature, top_p, repetition_penalty, web_search):
-        answer = ask_question(question, temperature, top_p, repetition_penalty, web_search)
-        if "news" in question.lower():
-            # Split the answer into individual news items
-            news_items = answer.split("---")
-            for item in news_items:
-                if item.strip():
-                    history.append((question, item.strip()))
-        else:
-            history.append((question, answer))
         return "", history
-    submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox], outputs=[question_input, chatbot])
-    export_vector_db_button = gr.Button("Export Vector DB to Excel")
-    vector_db_excel_output = gr.File(label="Download Vector DB Excel File")
-    export_vector_db_button.click(export_vector_db_to_excel, inputs=[], outputs=vector_db_excel_output)
     extract_button = gr.Button("Extract Database to Excel")
     excel_output = gr.File(label="Download Excel File")
     extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
@@ -534,6 +502,10 @@ with gr.Blocks() as demo:
     memory_excel_output = gr.File(label="Download Memory Excel File")
     export_memory_button.click(export_memory_db_to_excel, inputs=[], outputs=memory_excel_output)
     clear_button = gr.Button("Clear Cache")
     clear_output = gr.Textbox(label="Cache Status")
     clear_button.click(clear_cache, inputs=[], outputs=clear_output)

 from langchain_core.documents import Document
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
 # Memory database to store question-answer pairs
 memory_database = {}
 conversation_history = []
+news_database = []
 def load_and_split_document_basic(file):
     """Loads and splits the document into pages."""
         huggingfacehub_api_token=huggingface_token
     )
+def generate_chunked_response(model, prompt, max_tokens=1000, max_chunks=5):
     full_response = ""
+    for i in range(max_chunks):
+        chunk = model(prompt + full_response, max_new_tokens=max_tokens)
+        chunk = chunk.strip()
+        if chunk.endswith((".", "!", "?")):
             full_response += chunk
             break
+        full_response += chunk
     return full_response.strip()
 def manage_conversation_history(question, answer, history, max_history=5):
             print(f"Found {len(result_block)} results on this page")
             for result in result_block:
                 link = result.find("a", href=True)
+                if link:
                     link = link["href"]
                     print(f"Processing link: {link}")
                     try:
                         webpage = session.get(link, headers=headers, timeout=timeout)
                         visible_text = extract_text_from_webpage(webpage.text)
                         if len(visible_text) > max_chars_per_page:
                             visible_text = visible_text[:max_chars_per_page] + "..."
+                        all_results.append({"link": link, "text": visible_text})
                         print(f"Successfully extracted text from {link}")
                     except requests.exceptions.RequestException as e:
                         print(f"Error retrieving webpage content: {e}")
+                        all_results.append({"link": link, "text": None})
                 else:
+                    print("No link found for this result")
+                    all_results.append({"link": None, "text": None})
             start += len(result_block)
     print(f"Search completed. Total results: {len(all_results)}")
     print("Search results:")
     for i, result in enumerate(all_results, 1):
         print(f"Result {i}:")
         print(f"  Link: {result['link']}")
         if result['text']:
             print(f"  Text: {result['text'][:100]}...")  # Print first 100 characters
     if not all_results:
         print("No search results found. Returning a default message.")
+        return [{"link": None, "text": "No information found in the web search results."}]
     return all_results
+def fetch_google_news_rss(query, num_results=10):
+    base_url = "https://news.google.com/rss/search"
+    params = {
+        "q": query,
+        "hl": "en-US",
+        "gl": "US",
+        "ceid": "US:en"
+    }
+    url = f"{base_url}?{urllib.parse.urlencode(params)}"
+    feed = feedparser.parse(url)
+    articles = []
+    for entry in feed.entries[:num_results]:
+        article = {
+            "published_date": entry.published,
+            "title": entry.title,
+            "url": entry.link,
+            "content": entry.summary
+        }
+        articles.append(article)
+    return articles
+def summarize_news_content(content, model):
+    prompt_template = """
+    Summarize the following news article in a concise manner:
+    {content}
     Summary:
     """
+    prompt = ChatPromptTemplate.from_template(prompt_template)
+    formatted_prompt = prompt.format(content=content)
+    summary = generate_chunked_response(model, formatted_prompt, max_tokens=200)
     return summary
+def process_google_news_rss(query, temperature, top_p, repetition_penalty):
+    model = get_model(temperature, top_p, repetition_penalty)
+    embed = get_embeddings()
+    articles = fetch_google_news_rss(query)
+    processed_articles = []
+    for article in articles:
+        summary = summarize_news_content(article["content"], model)
+        processed_article = {
+            "published_date": article["published_date"],
+            "title": article["title"],
+            "url": article["url"],
+            "content": article["content"],
+            "summary": summary
+        }
+        processed_articles.append(processed_article)
+    # Add processed articles to the database
+    docs = [Document(page_content=article["summary"], metadata={"url": article["url"], "title": article["title"], "published_date": article["published_date"]}) for article in processed_articles]
+    if os.path.exists("faiss_database"):
+        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+        database.add_documents(docs)
+    else:
+        database = FAISS.from_documents(docs, embed)
+    database.save_local("faiss_database")
+    # Update news_database for excel export
+    global news_database
+    news_database.extend(processed_articles)
+    return f"Processed and added {len(processed_articles)} news articles to the database."
+def export_news_to_excel():
+    global news_database
+    df = pd.DataFrame(news_database)
+    with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
+        excel_path = tmp.name
+        df.to_excel(excel_path, index=False)
+    return excel_path
+def ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss):
     global conversation_history
     if not question:
     model = get_model(temperature, top_p, repetition_penalty)
     embed = get_embeddings()
+    # Check if the FAISS database exists
+    if os.path.exists("faiss_database"):
+        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+    else:
+        database = None
     if web_search:
         search_results = google_search(question)
+        web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
+        if database is None:
+            database = FAISS.from_documents(web_docs, embed)
+        else:
+            database.add_documents(web_docs)
+        database.save_local("faiss_database")
+        context_str = "\n".join([doc.page_content for doc in web_docs])
         prompt_template = """
         Answer the question based on the following web search results:
         {context}
         Current Question: {question}
         If the web search results don't contain relevant information, state that the information is not available in the search results.
+        Provide a concise and direct answer to the question without mentioning the web search or these instructions:
         """
         prompt_val = ChatPromptTemplate.from_template(prompt_template)
         formatted_prompt = prompt_val.format(context=context_str, question=question)
+    elif google_news_rss:
+        if database is None:
+            return "No news articles available. Please fetch news articles first."
+        retriever = database.as_retriever()
+        relevant_docs = retriever.get_relevant_documents(question)
+        context_str = "\n".join([f"Title: {doc.metadata['title']}\nURL: {doc.metadata['url']}\nSummary: {doc.page_content}" for doc in relevant_docs])
+        prompt_template = """
+        Answer the question based on the following news summaries:
+        News Summaries:
+        {context}
+        Current Question: {question}
+        If the news summaries don't contain relevant information, state that the information is not available in the news articles.
+        Provide a concise and direct answer to the question without mentioning the news summaries or these instructions:
+        """
+        prompt_val = ChatPromptTemplate.from_template(prompt_template)
+        formatted_prompt = prompt_val.format(context=context_str, question=question)
+    else:
+        if database is None:
+            return "No documents available. Please upload documents, enable web search, or fetch news articles to answer questions."
         history_str = "\n".join([f"Q: {item['question']}\nA: {item['answer']}" for item in conversation_history])
         prompt_val = ChatPromptTemplate.from_template(prompt)
         formatted_prompt = prompt_val.format(history=history_str, context=context_str, question=question)
+    full_response = generate_chunked_response(model, formatted_prompt)
+    # Extract only the part after the last occurrence of a prompt-like sentence
+    answer_patterns = [
+        r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
+        r"Provide a concise and direct answer to the question without mentioning the news summaries or these instructions:",
+        r"Provide a concise and direct answer to the question:",
+        r"Answer:"
+    ]
+    for pattern in answer_patterns:
+        match = re.split(pattern, full_response, flags=re.IGNORECASE)
+        if len(match) > 1:
+            answer = match[-1].strip()
+            break
+    else:
+        # If no pattern is found, return the full response
+        answer = full_response.strip()
+    if not web_search and not google_news_rss:
         memory_database[question] = answer
         conversation_history = manage_conversation_history(question, answer, conversation_history)
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
 def extract_db_to_excel():
     embed = get_embeddings()
     database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
 # Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Chat with your PDF documents and News")
     with gr.Row():
         file_input = gr.Files(label="Upload your PDF documents", file_types=[".pdf"])
     with gr.Row():
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(label="Conversation")
+            question_input = gr.Textbox(label="Ask a question about your documents or news")
             submit_button = gr.Button("Submit")
         with gr.Column(scale=1):
             temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
             top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
             repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
             web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
+            google_news_rss_checkbox = gr.Checkbox(label="Google News RSS", value=False)
+    with gr.Row():
+        news_query_input = gr.Textbox(label="Enter news query")
+        fetch_news_button = gr.Button("Fetch News")
+    news_fetch_output = gr.Textbox(label="News Fetch Status")
+    def chat(question, history, temperature, top_p, repetition_penalty, web_search, google_news_rss):
+        answer = ask_question(question, temperature, top_p, repetition_penalty, web_search, google_news_rss)
+        history.append((question, answer))
         return "", history
+    submit_button.click(chat, inputs=[question_input, chatbot, temperature_slider, top_p_slider, repetition_penalty_slider, web_search_checkbox, google_news_rss_checkbox], outputs=[question_input, chatbot])
+    fetch_news_button.click(process_google_news_rss, inputs=[news_query_input, temperature_slider, top_p_slider, repetition_penalty_slider], outputs=news_fetch_output)
     extract_button = gr.Button("Extract Database to Excel")
     excel_output = gr.File(label="Download Excel File")
     extract_button.click(extract_db_to_excel, inputs=[], outputs=excel_output)
     memory_excel_output = gr.File(label="Download Memory Excel File")
     export_memory_button.click(export_memory_db_to_excel, inputs=[], outputs=memory_excel_output)
+    export_news_button = gr.Button("Download News Excel File")
+    news_excel_output = gr.File(label="Download News Excel File")
+    export_news_button.click(export_news_to_excel, inputs=[], outputs=news_excel_output)
     clear_button = gr.Button("Clear Cache")
     clear_output = gr.Textbox(label="Cache Status")
     clear_button.click(clear_cache, inputs=[], outputs=clear_output)