Spaces:

DrishtiSharma
/

chat-w-google-patents

Runtime error

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

7f17ee4

verified ·

1 Parent(s): fd79de0

Update test.py

Browse files

Files changed (1) hide show

test.py +183 -167

test.py CHANGED Viewed

@@ -1,170 +1,186 @@
-from typing import List, Union, Optional
 import os
-import requests
 import re
-import time
 import shutil
-import subprocess
-import pandas as pd
-from selenium import webdriver
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-import chromedriver_autoinstaller
-class PatentDownloader:
-    url = "https://patents.google.com"
-    def __init__(self, verbose: bool = False):
-        """
-        Parameters
-        ----------
-        verbose : bool
-            Print additional debug information.
-        """
-        self.verbose = verbose
-        self.chrome_path = self.install_chrome()
-    def install_chrome(self) -> str:
-        """
-        Download and install Google Chrome dynamically.
-        Returns
-        -------
-        str: Path to the Chrome binary.
-        """
-        chrome_path = "/usr/bin/google-chrome"
-        if not shutil.which("google-chrome"):
-            print("Downloading and installing Google Chrome...")
-            subprocess.run(
-                "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb -O chrome.deb",
-                shell=True,
-                check=True,
-            )
-            subprocess.run(
-                "apt-get update && apt-get install -y ./chrome.deb",
-                shell=True,
-                check=True,
-            )
-            os.remove("chrome.deb")
-        if not shutil.which("google-chrome"):
-            raise ValueError("Google Chrome installation failed!")
-        return chrome_path
-    def download(self, patent: Union[str, List[str]], output_path: str = "./",
-                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
-        """
-        Download patent document(s) as PDF.
-        """
-        if isinstance(patent, list) or os.path.isfile(patent):
-            self.get_pdfs(patent, output_path, waiting_time, remove_kind_codes)
-        else:
-            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
-    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 10,
-                remove_kind_codes: Optional[List[str]] = None) -> None:
-        """
-        Download a single patent PDF.
-        """
-        if remove_kind_codes:
-            for kind_code in remove_kind_codes:
-                patent = re.sub(kind_code + "$", "", patent)
-        # Automatically install ChromeDriver
-        chromedriver_autoinstaller.install()
-        # Set up Chrome options
-        chrome_options = Options()
-        chrome_options.binary_location = self.chrome_path
-        chrome_options.add_argument("--headless")
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        # Initialize Selenium WebDriver
-        service = Service()
-        driver = webdriver.Chrome(service=service, options=chrome_options)
-        pdf_link = None  # Ensure pdf_link is defined
-        try:
-            driver.get(self.url)
-            # Wait for the search input field and interact with it
-            print("Waiting for the search input field...")
-            search_input_xpath = "//input[@aria-label='Search patents']"
-            WebDriverWait(driver, 20).until(
-                EC.presence_of_element_located((By.XPATH, search_input_xpath))
-            )
-            element = driver.find_element(By.XPATH, search_input_xpath)
-            print("Search input field located.")
-            element.send_keys(patent)
-            element.send_keys(Keys.RETURN)
-            # Wait for search results to load
-            print("Waiting for search results to load...")
-            WebDriverWait(driver, 20).until(
-                EC.presence_of_element_located((By.TAG_NAME, "body"))
-            )
-            time.sleep(waiting_time)
-            # Parse HTML and get the PDF link
-            soup = BeautifulSoup(driver.page_source, "html.parser")
-            pdf_link = self.get_pdf_link(soup, patent)
-        except Exception as e:
-            print(f"Error occurred: {e}")
-        finally:
-            driver.quit()
-        # Download the PDF
-        if pdf_link:
-            validate_directory(output_path)
-            pdf_content = requests.get(pdf_link).content
-            with open(os.path.join(output_path, f"{patent}.pdf"), "wb") as file:
-                file.write(pdf_content)
-            print(f">>> Patent {patent} successfully downloaded <<<")
-        else:
-            print(f"Error: PDF link for patent {patent} not found!")
-    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
-                 waiting_time: int = 10, remove_kind_codes: Optional[List[str]] = None) -> None:
-        """
-        Download multiple patent PDFs from a list or file.
-        """
-        if isinstance(patents, str):
-            if patents.lower().endswith('csv'):
-                df_patents = pd.read_csv(patents)
-                patents = df_patents['patent_number'].to_list()
-            elif patents.lower().endswith('txt'):
-                with open(patents, 'r') as txt_file:
-                    patents = txt_file.read().splitlines()
-            else:
-                raise NotImplementedError(f'Unsupported file type: {patents}')
-        for i, patent in enumerate(patents):
-            print(len(patents) - i, "patent(s) remaining.")
-            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)
-    @staticmethod
-    def get_pdf_link(soup: BeautifulSoup, patent: str) -> Optional[str]:
-        """
-        Extract the PDF link from parsed HTML.
-        """
-        pdf_links = [link['href'] for link in soup.find_all('a', href=True) if link['href'].lower().endswith("pdf")]
-        for link in pdf_links:
-            if patent.lower() in link.lower():
-                return link
-        return None
-def validate_directory(directory: str) -> None:
-    """
-    Ensure the output directory exists.
-    """
-    if not os.path.exists(directory):
-        os.makedirs(directory)

+import sys
 import os
 import re
 import shutil
+import time
+import streamlit as st
+import nltk
+# Ensure NLTK 'punkt' resource is downloaded
+nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
+os.makedirs(nltk_data_path, exist_ok=True)
+nltk.data.path.append(nltk_data_path)
+# Force download of the 'punkt' resource
+try:
+    print("Ensuring NLTK 'punkt' resource is downloaded...")
+    nltk.download("punkt", download_dir=nltk_data_path)
+except Exception as e:
+    print(f"Error downloading NLTK 'punkt': {e}")
+sys.path.append(os.path.abspath("."))
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
+from langchain.llms import OpenAI
+from langchain.document_loaders import UnstructuredPDFLoader
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.text_splitter import NLTKTextSplitter
+from patent_downloader import PatentDownloader
+PERSISTED_DIRECTORY = "."
+# Fetch API key securely from the environment
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    st.error("Critical Error: OpenAI API key not found in the environment variables. Please configure it.")
+    st.stop()
+def check_poppler_installed():
+    if not shutil.which("pdfinfo"):
+        raise EnvironmentError(
+            "Poppler is not installed or not in PATH. Install 'poppler-utils' for PDF processing."
+        )
+check_poppler_installed()
+def load_docs(document_path):
+    try:
+        loader = UnstructuredPDFLoader(
+            document_path,
+            mode="elements",
+            strategy="fast",
+            ocr_languages=None  # Explicitly disable OCR
+        )
+        documents = loader.load()
+        text_splitter = NLTKTextSplitter(chunk_size=1000)
+        return text_splitter.split_documents(documents)
+    except Exception as e:
+        st.error(f"Failed to load and process PDF: {e}")
+        st.stop()
+def already_indexed(vectordb, file_name):
+    indexed_sources = set(
+        x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
+    )
+    return file_name in indexed_sources
+def load_chain(file_name=None):
+    loaded_patent = st.session_state.get("LOADED_PATENT")
+    vectordb = Chroma(
+        persist_directory=PERSISTED_DIRECTORY,
+        embedding_function=HuggingFaceEmbeddings(),
+    )
+    if loaded_patent == file_name or already_indexed(vectordb, file_name):
+        st.write("✅ Already indexed.")
+    else:
+        vectordb.delete_collection()
+        docs = load_docs(file_name)
+        st.write("🔍 Number of Documents: ", len(docs))
+        vectordb = Chroma.from_documents(
+            docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
+        )
+        vectordb.persist()
+        st.session_state["LOADED_PATENT"] = file_name
+    memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        return_messages=True,
+        input_key="question",
+        output_key="answer",
+    )
+    return ConversationalRetrievalChain.from_llm(
+        OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
+        vectordb.as_retriever(search_kwargs={"k": 3}),
+        return_source_documents=False,
+        memory=memory,
+    )
+def extract_patent_number(url):
+    pattern = r"/patent/([A-Z]{2}\d+)"
+    match = re.search(pattern, url)
+    return match.group(1) if match else None
+def download_pdf(patent_number):
+    try:
+        patent_downloader = PatentDownloader(verbose=True)
+        output_path = patent_downloader.download(patents=patent_number)
+        return output_path[0]  # Return the first file path
+    except Exception as e:
+        st.error(f"Failed to download patent PDF: {e}")
+        st.stop()
+if __name__ == "__main__":
+    st.set_page_config(
+        page_title="Patent Chat: Google Patents Chat Demo",
+        page_icon="📖",
+        layout="wide",
+        initial_sidebar_state="expanded",
+    )
+    st.header("📖 Patent Chat: Google Patents Chat Demo")
+    # Allow user to input the Google patent link
+    patent_link = st.text_input("Enter Google Patent Link:", key="PATENT_LINK")
+    if not patent_link:
+        st.warning("Please enter a Google patent link to proceed.")
+        st.stop()
+    patent_number = extract_patent_number(patent_link)
+    if not patent_number:
+        st.error("Invalid patent link format. Please provide a valid Google patent link.")
+        st.stop()
+    st.write(f"Patent number: **{patent_number}**")
+    # Download the PDF file
+    pdf_path = f"{patent_number}.pdf"
+    if os.path.isfile(pdf_path):
+        st.write("✅ File already downloaded.")
+    else:
+        st.write("📥 Downloading patent file...")
+        pdf_path = download_pdf(patent_number)
+        st.write(f"✅ File downloaded: {pdf_path}")
+    # Load the conversational chain
+    st.write("🔄 Loading document into the system...")
+    chain = load_chain(pdf_path)
+    st.success("🚀 Document successfully loaded! You can now start asking questions.")
+    # Initialize the chat
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = [
+            {"role": "assistant", "content": "Hello! How can I assist you with this patent?"}
+        ]
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # User input
+    if user_input := st.chat_input("What is your question?"):
+        st.session_state.messages.append({"role": "user", "content": user_input})
+        with st.chat_message("user"):
+            st.markdown(user_input)
+        # Generate assistant response
+        with st.chat_message("assistant"):
+            message_placeholder = st.empty()
+            full_response = ""
+        with st.spinner("Generating response..."):
+            try:
+                assistant_response = chain({"question": user_input})
+                for chunk in assistant_response["answer"].split():
+                    full_response += chunk + " "
+                    time.sleep(0.05)  # Simulate typing effect
+                    message_placeholder.markdown(full_response + "▌")
+            except Exception as e:
+                full_response = f"An error occurred: {e}"
+            finally:
+                message_placeholder.markdown(full_response)
+        st.session_state.messages.append({"role": "assistant", "content": full_response})