# Spaces: bbfizp / patent-mcp (Running)
# File size: 3,571 Bytes

import gradio as gr
import requests
import urllib.parse
import xml.etree.ElementTree as ET
from typing import List, Dict


# Bearer token sent with EPO OPS search requests; update_tk() replaces it with a
# freshly issued one at runtime.
# NOTE(review): this initial value is a credential committed to source — consider
# rotating it and loading it from configuration instead.
athtk = "0dWGjl7XuVq54v012KjGLEhRSLjj"

def update_tk():
    """
    Requests a fresh access token from EPO OPS and stores it in the global ``athtk``.

    Sends a client-credentials POST to the OPS token endpoint and replaces the
    module-level bearer token with the ``access_token`` field of the JSON reply.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within the timeout.
        KeyError: If the JSON reply carries no ``access_token`` field.
    """
    global athtk

    url = "https://ops.epo.org/3.2/auth/accesstoken"

    payload = 'grant_type=client_credentials'
    headers = {
        # NOTE(review): client credentials are hard-coded here; they should be
        # rotated and supplied through configuration rather than source.
        'authorization': 'Basic RU45NzZ3RE5UM09lZ3ZCUURwMHE0NlJXN0xwZE5CNjNFZTNxRGJ6UnJwNGFWQVBmOnhZR05uQzk1N0dKV0lvcnM2ZWV4TmVrcUFybUVtdzU1ZkxjbUZlcDdDR0w5ZzZCdGsyY0hMeVBHZTkwQTBXOEE=',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on the error payload.
    response.raise_for_status()
    athtk = response.json()['access_token']

def extract_patent_abstracts(xml_content: str) -> List[Dict[str, str]]:
    """
    Extracts English abstracts, country, document number, and date from patent XML data.

    Documents without an English abstract are skipped. The abstract text is the
    space-joined content of the abstract's <p> children, including text nested
    inside inline child elements; paragraphs that are empty after stripping are
    ignored.

    Args:
        xml_content (str): XML content as a string.

    Returns:
        List[Dict[str, str]]: One dictionary per document with the keys
        'country', 'doc_number', 'date' and 'abstract'. Missing attributes or a
        missing/empty date are reported as 'N/A'.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_content is not well-formed XML.
    """
    root = ET.fromstring(xml_content)
    namespaces = {'default': 'http://www.epo.org/exchange'}
    # Publication date taken from the 'docdb' document-id only, so other
    # document-id variants under publication-reference are ignored.
    date_path = (
        './default:bibliographic-data/default:publication-reference'
        '/default:document-id[@document-id-type="docdb"]/default:date'
    )
    extracted_patents = []

    for doc in root.findall('.//default:exchange-document', namespaces):
        # Only documents carrying an English abstract are reported.
        if (en_abstract := doc.find('.//default:abstract[@lang="en"]', namespaces)) is None:
            continue

        # itertext() also collects text inside and after inline child elements,
        # which a plain `.text` lookup would silently drop.
        paragraphs = (
            ''.join(p.itertext()).strip()
            for p in en_abstract.findall('default:p', namespaces)
        )
        abstract_text = ' '.join(text for text in paragraphs if text)

        date_elem = doc.find(date_path, namespaces)
        # An empty <date/> has text None; keep the value a string.
        date = date_elem.text if date_elem is not None and date_elem.text else 'N/A'

        extracted_patents.append({
            'country': doc.get('country', 'N/A'),
            'doc_number': doc.get('doc-number', 'N/A'),
            'date': date,
            'abstract': abstract_text,
        })

    return extracted_patents



def search_from_abstract(query, retried=False):
    """
    Searches EPO OPS published data for a query and returns the English abstracts found.

    Requests results 1-100 from the OPS search endpoint using the current
    global access token. On a 400 response the token is refreshed once via
    update_tk() and the search is repeated.

    Args:
        query: Search expression sent as the ``q`` parameter (URL-encoded here).
        retried: Internal flag; True when this call is the single retry made
            after refreshing the access token.

    Returns:
        The list built by extract_patent_abstracts for a 200 response, or None
        when the response is neither 200 nor an HTTP error status.

    Raises:
        requests.HTTPError: For error responses, including a 400 that persists
            after the token refresh.
        requests.Timeout: If the service does not answer within the timeout.
    """
    base_url = "https://ops.epo.org/3.2/rest-services/published-data/search"
    endpoint = "abstract"
    start_range = 1
    end_range = 100

    # The headers are deliberately not printed: they contain the bearer token.
    headers = {
        'accept': 'application/xml',
        'Authorization': f'Bearer {athtk}',
    }
    url = f"{base_url}/{endpoint}?q={urllib.parse.quote_plus(query)}&Range={start_range}-{end_range}"

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.request("GET", url, headers=headers, timeout=30)

    # A 400 is treated as a rejected token (presumably expired — confirm against
    # the OPS docs); refresh once and retry, guarded by `retried`.
    if response.status_code == 400 and not retried:
        update_tk()
        return search_from_abstract(query, retried=True)

    if response.status_code == 200:
        return extract_patent_abstracts(response.text)

    print(f"Error: {response.status_code} - {response.text}")
    response.raise_for_status()
    # Reached only for non-200 statuses that raise_for_status() does not reject.
    return None


# Gradio front end: a single text input routed to search_from_abstract, with
# the function's return value rendered in a text output.
app = gr.Interface(
    title="Patent Abstract Search",
    description="Search patents by abstract using the European Patent Office API.",
    fn=search_from_abstract,
    inputs="text",
    outputs="text",
)

# Start serving at import time, with Gradio's public share link and MCP server
# options both switched on.
app.launch(share=True, mcp_server=True)