import re
import unicodedata

import requests
from bs4 import BeautifulSoup


def retrieve_parsed_doc(patent_information, summaries_generated):
    """Fetch a Google Patents page and return [abstract, background, claim_list]."""
    try:
        language_config = "en"
        # Accept either a full Google Patents URL or a bare patent code.
        if "https" in patent_information:
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        url = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "lxml")

        if "Abstract" in summaries_generated:
            abstract = clean_text(soup.find("div", class_="abstract").prettify())
        else:
            abstract = None

        if "Background" in summaries_generated:
            # The last itemprop="description" block holds the description/background text.
            background = clean_text(
                soup.find_all(itemprop="description", itemscope="")[-1].prettify()
            )
        else:
            background = None

        if "Claims" in summaries_generated:
            claims_section = soup.find(itemprop="claims")
            # Divs whose class attribute is exactly "claim" hold the individual claims.
            claim_divs = claims_section.select("div[class=claim]")
            formatted_claims = {clean_text(div.prettify()) for div in claim_divs}
            formatted_claims.discard("")  # drop claims that cleaned down to nothing
            claim_list = sorted(formatted_claims, key=len, reverse=True)  # longest first
        else:
            claim_list = None

        return [abstract, background, claim_list]
    except Exception as e:
        print(f"[ERROR] {e}")
        return None


def get_word_index(s, limit):
    """Return the character index at which the word at position `limit` starts."""
    try:
        words = re.findall(r"\s*\S+\s*", s)
        # Length of the first `limit` chunks, plus any leading whitespace of the
        # next chunk, gives the offset of word number `limit`.
        return sum(map(len, words[:limit])) + len(words[limit]) - len(words[limit].lstrip())
    except IndexError:
        # Fewer than `limit` words: fall back to a flat character cap.
        chr_limit = 3500
        return len(s) if len(s) < chr_limit else chr_limit


def post_process(s):
    """Basic post-processing: tidy spacing and cut off after the last full sentence."""
    if s.startswith(" "):
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    return ".".join(s.split(".")[:-1]) + "."


def clean_text(text):
    # TODO: optimize text cleaning
    cleaned = re.sub(r"<.*?>", "", text)         # strip HTML tags
    cleaned = re.sub(r"\([^)]*\)", "", cleaned)  # drop parenthesised reference numerals
    # Insert a sentence break where lowercase text runs straight into an uppercase letter.
    cleaned = re.sub(r"(\w)([A-Z]+)", r"\1. \2", cleaned)
    cleaned = cleaned.strip()
    # Remove control characters (newlines, tabs, etc. left over from prettify()).
    cleaned = "".join(ch for ch in cleaned if unicodedata.category(ch)[0] != "C")
    cleaned = re.sub(" +", " ", cleaned)  # collapse runs of spaces
    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = cleaned.lstrip("0123456789.- ")  # remove claim numbers at the start
    cleaned = re.sub(r"\b(\w+)( \1\b)+", r"\1", cleaned)  # remove repeated consecutive words
    return cleaned
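

# --- Example usage ----------------------------------------------------------
# Illustrative sketch only: the patent number below is an arbitrary placeholder,
# and live network access to patents.google.com is assumed.
if __name__ == "__main__":
    sections = retrieve_parsed_doc("US1234567B1", ["Abstract", "Background", "Claims"])
    if sections is not None:
        abstract, background, claim_list = sections
        print("Abstract:", abstract)
        if claim_list:
            print("Longest claim:", claim_list[0])
        if background:
            # Truncate the background to roughly 120 words, then tidy the tail.
            snippet = background[: get_word_index(background, 120)]
            print("Background excerpt:", post_process(snippet))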