# patent-summarizer/util/textproc.py
import re
import unicodedata
import requests
from bs4 import BeautifulSoup


def retrieve_parsed_doc(patent_information, summaries_generated):
    """Fetch a Google Patents page and return [abstract, background, claim_list].

    Sections not requested in summaries_generated are returned as None;
    the whole call returns None if fetching or parsing fails."""
    try:
        language_config = "en"
        if "https" in patent_information:
            # A full Google Patents URL was passed; pull the patent code out of the path.
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'lxml')
if "Abstract" in summaries_generated:
abstract = clean_text(soup.find({"div":{"class":"abstract"}}).prettify())
else:
abstract = None
if "Background" in summaries_generated:
background = clean_text(soup.find_all(itemprop="description",
itemscope="")[-1:][0].prettify())
else:
background = None
if "Claims" in summaries_generated:
claims = soup.find(itemprop="claims")
main_claim = claims.find_all({"div":{"class":"claim"}})
main_claims = main_claim[0].select("div[class=claim]")
formatted_claims = set()
for i in main_claims:
formatted_claims.add(clean_text(i.prettify()))
try:
formatted_claims.remove('')
except:
pass
claim_list = sorted(list(formatted_claims), key=len, reverse=True)
else:
claim_list = None
        return [abstract, background, claim_list]
    except Exception as e:
        print(f'[ERROR] {e}')
        return None


def get_word_index(s, limit):
    """Return the character offset in s at which word number `limit` (zero-based)
    begins, so that s[:offset] keeps the first `limit` words."""
    try:
        words = re.findall(r'\s*\S+\s*', s)
        return sum(map(len, words[:limit])) + len(words[limit]) - len(words[limit].lstrip())
    except IndexError:
        # Fewer than `limit` words: fall back to a hard character limit.
        l = len(s)
        chr_limit = 3500
        return l if l < chr_limit else chr_limit


def post_process(s):
    # Basic post-processing
    if s and s[0] == " ":
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    # Drop any trailing fragment after the last full stop.
    return ".".join(s.split(".")[:-1]) + "."


def clean_text(text):
    # TODO: optimize text cleaning
    reg = re.compile(r'<.*?>')
    cleaned = reg.sub('', text)  # strip HTML tags left over from prettify()
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)  # drop parenthesised content
cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
    cleaned = cleaned.strip()
    # Remove control characters (Unicode category "C").
    cleaned = "".join(ch for ch in cleaned if unicodedata.category(ch)[0] != "C")
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = cleaned.lstrip('0123456789.- ')  # remove nums at start
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)  # remove repeated consecutive words
    return cleaned
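

# Usage sketch: a minimal, illustrative example of how these helpers fit
# together. It assumes network access; the patent code below is an arbitrary
# placeholder, and a full Google Patents URL works as input too.
if __name__ == "__main__":
    sections = retrieve_parsed_doc("US9009836B1", ["Abstract", "Claims"])
    if sections is not None:
        abstract, background, claim_list = sections
        print("Abstract:", abstract)
        if claim_list:
            # Cut the longest claim to roughly 50 words before summarization.
            longest = claim_list[0]
            print("Longest claim (first ~50 words):", longest[:get_word_index(longest, 50)])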