import re
import unicodedata
import requests
from bs4 import BeautifulSoup

def retrieve_parsed_doc(patent_information, summaries_generated):
    """Fetch a Google Patents page and return [abstract, background, claim_list].

    `patent_information` may be a full Google Patents URL or a bare
    publication number; each entry is None unless its section name appears
    in `summaries_generated`.
    """
    try:
        language_config = "en"
        if "https" in patent_information:
            # Full URL, e.g. https://patents.google.com/patent/<code>/en
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(URL, timeout=30)
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'lxml')

        if "Abstract" in summaries_generated:
            abstract = clean_text(soup.find("div", class_="abstract").prettify())
        else:
            abstract = None

        if "Background" in summaries_generated:
            background = clean_text(soup.find_all(itemprop="description",
                                                  itemscope="")[-1].prettify())
        else:
            background = None

        if "Claims" in summaries_generated:
            claims = soup.find(itemprop="claims")
            # Each claim sits in its own <div class="claim"> inside the claims section.
            claim_divs = claims.select("div[class=claim]")
            formatted_claims = {clean_text(div.prettify()) for div in claim_divs}
            formatted_claims.discard('')  # drop claims that cleaned down to nothing
            claim_list = sorted(formatted_claims, key=len, reverse=True)
        else:
            claim_list = None

        return [abstract, background, claim_list]
    except Exception as e:
        print(f'[ERROR] {e}')
        return None


def get_word_index(s, limit):
    """Return the character index in s just past the first `limit` words."""
    try:
        words = re.findall(r'\s*\S+\s*', s)
        prefix = sum(map(len, words[:limit]))
        # Add back any leading whitespace of the next word so the cut lands
        # exactly where word `limit` begins.
        return prefix + len(words[limit]) - len(words[limit].lstrip())
    except IndexError:
        # Fewer than `limit` words: fall back to a hard character cap.
        l = len(s)
        chr_limit = 3500
        return l if l < chr_limit else chr_limit

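# Illustrative values (not from the original source):
#   get_word_index("one two three four", 2) -> 8, the index where "three" starts
#   get_word_index("short text", 50)        -> 10, i.e. len(s) via the fallback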

def post_process(s):
    # Basic post-processing: trim a leading space, repair hyphen and period
    # spacing, and drop any unfinished trailing sentence fragment.
    if s.startswith(" "):  # startswith is safe on an empty string
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    return ".".join(s.split(".")[:-1]) + "."

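# For example (illustrative input): post_process(" A micro- fluidic pump . Trailing frag")
# returns "A micro-fluidic pump.", cutting the fragment after the last period.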

def clean_text(text):
    # TODO: optimize text cleaning
    reg = re.compile(r'<.*?>')
    cleaned = reg.sub('', text)                  # strip HTML tags
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)  # drop parentheticals such as figure refs
    # Insert a sentence break where tag stripping glued a word onto the
    # capitalised start of the next sentence.
    cleaned = re.sub(r"([a-z])([A-Z])", r'\1. \2', cleaned)
    cleaned = cleaned.strip()
    cleaned = "".join(ch for ch in cleaned if unicodedata.category(ch)[0] != "C")
    cleaned = re.sub(' +', ' ', cleaned)         # collapse runs of spaces
    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = cleaned.lstrip('0123456789.- ')    # remove claim numbering at the start
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)  # collapse repeated consecutive words

    return cleaned
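

# Example usage; "US1234567A" below is a placeholder, not a verified publication
# number. Substitute a real number or a full Google Patents URL:
if __name__ == "__main__":
    sections = retrieve_parsed_doc("US1234567A", ["Abstract", "Claims"])
    if sections:
        abstract, background, claim_list = sections
        print(abstract)
        if claim_list:
            print(claim_list[0])  # longest cleaned claim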