| | |
| | from shapely.geometry import Point |
| | import pandas as pd |
| | from tqdm import tqdm |
| | import streamlit as st |
| | import numpy as np |
| | import json, requests |
| | import pandas as pd |
| | |
| |
|
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | from math import radians, cos, sin, asin, sqrt |
| |
|
| | |
| | |
| |
|
| |
|
| | from sentence_transformers import SentenceTransformer, util |
| |
|
| |
|
# Directory prefix that is prepended to the data-file names read by this module.
path: str = "Climate_site/python_scripts/"
| |
|
| |
|
@st.cache_resource
def model_nlp():
    """Build the ``all-MiniLM-L6-v2`` SentenceTransformer used for text scoring.

    The function is decorated with ``st.cache_resource``; callers obtain the
    model through this function rather than constructing it themselves.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
| |
|
| | import unicodedata |
| |
|
| | from metaphone import doublemetaphone |
| | from fuzzywuzzy import fuzz |
| | from difflib import SequenceMatcher |
| | import re |
| |
|
| | import geopandas as gpd |
| | from geopandas import GeoDataFrame |
| |
|
| |
|
@st.cache_data
def load_data():
    """Read ``institutions.tsv`` and return it as a nested dictionary.

    The file is looked up under the module-level ``path`` prefix and parsed as
    tab-separated values; its first column becomes the outer dictionary key and
    every remaining column is stored under its header name.
    """
    institutions = pd.read_csv(path + "institutions.tsv", delimiter="\t", index_col=0)
    return institutions.to_dict('index')


# Institution table, loaded once when the module is imported.
dic_institutions = load_data()
| |
|
| |
|
| | |
| |
|
def URL(base_URL, entity_type, filters):
    """Return the request URL made of the three parts concatenated in order."""
    return base_URL + entity_type + filters
| |
|
| |
|
def get_data(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            app on a stalled connection.

    Returns:
        The parsed JSON payload (whatever ``json.loads`` produces for the body).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of as an opaque KeyError/JSON error later.
    response.raise_for_status()
    return json.loads(response.text)
| |
|
| | |
| | |
| |
|
def norm(vector):
    """Return the Euclidean length of ``vector`` (square root of the sum of squares)."""
    squared_total = 0
    for component in vector:
        squared_total = squared_total + component * component
    return np.sqrt(squared_total)
| |
|
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of two equally ordered numeric sequences.

    The result is the dot product of the vectors divided by the product of
    their Euclidean norms (computed with ``norm``). Pairs are formed with
    ``zip``, so extra trailing items of the longer vector are ignored in the
    dot product.
    """
    magnitude_product = norm(vec_a) * norm(vec_b)
    dot_product = 0
    for left, right in zip(vec_a, vec_b):
        dot_product = dot_product + left * right
    return dot_product / magnitude_product
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
| |
|
def related_patents(research_words, display):
    """Query PatentsView for patents matching ``research_words`` in their abstract.

    The query applies the ``_text_all`` operator to ``patent_abstract`` and asks
    for title, abstract, date, assignee, inventor and cited-by fields.

    Args:
        research_words: Key words inserted into the query; spaces are encoded
            as ``%20``.
        display: When truthy, print the total match count, the key words and
            the request URL.

    Returns:
        A dict keyed by ``"US-<patent_number>"``. Each value is a dict with the
        keys ``title``, ``abstract``, ``assignee``, ``assignee_city``,
        ``assignee_country`` (comma-joined strings), ``list_inventors`` (the raw
        inventor records), ``inventors`` (comma-joined "first last" names),
        ``date`` and ``number_citations``. The dict is empty when the response
        holds no patents.
    """
    base_URL_PV = "https://api.patentsview.org/"
    filter_works = "patents/query?"
    filter_PV = "q={%22_and%22:[{%22_text_all%22:{%22patent_abstract%22:%22"
    filter_PV += research_words
    filter_PV += "%22}}]}&f=[%22patent_number%22,%22patent_title%22,%22assignee_country%22,%22patent_date%22,%22assignee_organization%22,%22inventor_longitude%22,%22inventor_latitude%22,%22inventor_last_name%22,%22inventor_id%22,%22inventor_first_name%22,%22cpc_subsection_title%22,%22assignee_city%22,%22patent_abstract%22,%22patent_kind%22,%22cpc_group_id%22,%22assignee_organization%22,%22citedby_patent_number%22]"
    filter_PV = filter_PV.replace(" ", "%20")

    url = URL(base_URL_PV, filter_works, filter_PV)
    data = get_data(url)

    if display:
        # The original printed an undefined name here (NameError); the key
        # words are the useful context for the count.
        print(data["total_patent_count"], research_words)
        print(url)

    dic_patents = {}
    # "patents" may be null/absent when nothing matches; treat that as empty.
    for patent in data.get("patents") or []:
        assignees = patent.get("assignees") or []
        inventors = patent.get("inventors") or []
        cited_by = patent.get("citedby_patents") or []

        # NOTE(review): the API appears to return one all-null placeholder
        # entry when a patent has no citing patents -- confirm. Counting only
        # entries that carry a number keeps such a placeholder from being
        # reported as one citation.
        number_citations = sum(
            1
            for cited in cited_by
            if cited and cited.get("citedby_patent_number") is not None
        )

        dic_patents["US-" + patent["patent_number"]] = {
            "title": patent["patent_title"],
            "abstract": patent["patent_abstract"],
            # str() keeps null organisations/cities as the text "None".
            "assignee": ", ".join(str(a["assignee_organization"]) for a in assignees),
            "assignee_city": ", ".join(str(a["assignee_city"]) for a in assignees),
            "assignee_country": ", ".join(str(a["assignee_country"]) for a in assignees),
            "list_inventors": inventors,
            "inventors": ", ".join(
                str(inv["inventor_first_name"]) + " " + str(inv["inventor_last_name"])
                for inv in inventors
            ),
            "date": patent["patent_date"],
            "number_citations": number_citations,
        }

    if display:
        print(" ")

    return dic_patents
| | |
| | |
| | |
| | |
| | |
def ranking_patents(research_words, details):
    """Score the patents found for ``research_words`` against the text ``details``.

    Each patent title and (when present) abstract is embedded with the model
    from ``model_nlp`` and compared to the embedding of ``details`` using
    ``cosine_similarity2``.

    Args:
        research_words: Key words forwarded to ``related_patents``.
        details: Reference text the patents are compared with.

    Returns:
        ``(dic_patents, dic_scores)`` where ``dic_patents`` is the result of
        ``related_patents`` and ``dic_scores`` maps each patent id to its
        scores and summary fields, in the same order as ``dic_patents``
        (no sorting is applied here). When no patent is found the string
        ``"Select other key words"`` is returned instead of a tuple, so callers
        must check the type before unpacking.
    """
    dic_patents = related_patents(research_words, False)

    # Bail out before touching the model: nothing to embed without patents.
    if not dic_patents:
        return "Select other key words"

    model = model_nlp()
    encoded_text = model.encode(details, convert_to_tensor=False).tolist()

    dic_scores = {}
    for patent_id, patent in dic_patents.items():
        encoded_title = model.encode(patent["title"], convert_to_tensor=False).tolist()
        score_title = cosine_similarity2(encoded_title, encoded_text)

        abstract = patent["abstract"]
        if abstract is not None:
            encoded_abstract = model.encode(abstract, convert_to_tensor=False).tolist()
            score_abstract = cosine_similarity2(encoded_abstract, encoded_text)
        else:
            score_abstract = None

        dic_scores[patent_id] = {
            # Key spelling ("comparision") is kept as-is; it is a column name.
            "title comparision": score_title,
            "abstract comparison": score_abstract,
            "title": patent["title"],
            "citations": patent["number_citations"],
            "date": patent["date"][:4],
            "assignee": patent["assignee"],
            "inventors": patent["inventors"],
            "number of co-inventors": len(patent["inventors"].split(",")),
        }

    return dic_patents, dic_scores
| |
|
| |
|
| |
|
| |
|
def get_ranking_own_research(research_key_words, details, size):
    """Return the ``size`` patents whose abstracts are most similar to ``details``.

    Args:
        research_key_words: Key words forwarded to ``ranking_patents``.
        details: Reference text forwarded to ``ranking_patents``.
        size: Maximum number of rows to return.

    Returns:
        A DataFrame with one row per patent, sorted by the
        ``"abstract comparison"`` score in descending order and truncated to
        ``size`` rows. If ``ranking_patents`` reports no result with a string,
        that string is returned unchanged; if it yields an empty score table,
        ``"No paper found"`` is returned.
    """
    ranking = ranking_patents(research_key_words, details)

    # ranking_patents signals "nothing found" with a plain string; it must be
    # detected before tuple-unpacking, otherwise unpacking raises ValueError.
    if isinstance(ranking, str):
        return ranking

    _, dic_scores_papers = ranking
    if not dic_scores_papers:
        return "No paper found"

    scores = pd.DataFrame(dic_scores_papers).T
    return scores.sort_values(by="abstract comparison", ascending=False).head(size)
| | |
| | |
| |
|
def extract_quantitative_data_patent(patent_id):
    """Fetch the descriptive fields of one patent from PatentsView.

    Args:
        patent_id: Identifier whose first three characters are a prefix to be
            dropped (the ids built elsewhere in this module look like
            ``"US-<number>"``).

    Returns:
        A tuple ``(google_patents_url, title, abstract, patent_date,
        co_inventors, assignees)`` where the last two items are comma-joined
        strings.
    """
    patent_number = str(patent_id[3:])

    url = "https://api.patentsview.org/patents/query?q={%22patent_id%22:%22" + patent_number + "%22}&f=[%22patent_number%22,%22patent_title%22,%22patent_abstract%22,%22patent_date%22,%22inventor_last_name%22,%22inventor_first_name%22,%22assignee_organization%22]"
    url_google = "https://patents.google.com/patent/US" + patent_number

    data = get_data(url)["patents"][0]

    # A null first or last name used to raise TypeError on string
    # concatenation; missing parts are now simply left out of the name.
    co_inventors = ", ".join(
        " ".join(
            part
            for part in (inventor["inventor_first_name"], inventor["inventor_last_name"])
            if part
        )
        for inventor in data["inventors"]
    )
    assignees = ", ".join(
        str(assignee["assignee_organization"]) for assignee in data["assignees"]
    )

    return (
        url_google,
        data["patent_title"],
        data["patent_abstract"],
        data["patent_date"],
        co_inventors,
        assignees,
    )
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
| |
|
| | |
| |
|
| | |
| | |
| |
|
# Lower-case name particles that ``ln_suff_merge`` glues onto the word that
# follows them. The order of the entries is preserved exactly.
ln_suff = [
    'oster', 'nordre', 'vaster', 'aust', 'vesle', 'da', 'van t', 'af',
    'al', 'setya', 'zu', 'la', 'na', 'mic', 'ofver', 'el',
    'vetle', 'van het', 'dos', 'ui', 'vest', 'ab', 'vste', 'nord',
    'van der', 'bin', 'ibn', 'war', 'fitz', 'alam', 'di', 'erch',
    'fetch', 'nga', 'ka', 'soder', 'lille', 'upp', 'ua', 'te',
    'ni', 'bint', 'von und zu', 'vast', 'vestre', 'over', 'syd', 'mac',
    'nin', 'nic', 'putri', 'bet', 'verch', 'norr', 'bath', 'della',
    'van', 'ben', 'du', 'stor', 'das', 'neder', 'abu', 'degli',
    'vre', 'ait', 'ny', 'opp', 'pour', 'kil', 'der', 'oz',
    'von', 'at', 'nedre', 'van den', 'setia', 'ap', 'gil', 'myljom',
    'van de', 'stre', 'dele', 'mck', 'de', 'mellom', 'mhic', 'binti',
    'ath', 'binte', 'snder', 'sre', 'ned', 'ter', 'bar', 'le',
    'mala', 'ost', 'syndre', 'sr', 'bat', 'sndre', 'austre', 'putra',
    'putera', 'av', 'lu', 'vetch', 'ver', 'puteri', 'mc', 'tre',
    'st',
]
| |
|
| | |
| | |
| |
|
# Generational suffixes that are stripped from a name before normalisation.
name_del = ['2nd', '3rd', 'Jr', 'Jr.', 'Junior', 'Sr', 'Sr.', 'Senior']

# One pattern for all suffixes: a leading space, the suffix, and then no word
# character. Longer alternatives come first so that "Jr." is preferred to "Jr".
_NAME_DEL_RE = re.compile(
    " (?:"
    + "|".join(re.escape(suffix) for suffix in sorted(name_del, key=len, reverse=True))
    + r")(?!\w)"
)


def name_delete(string):
    """Return ``string`` with every generational suffix from ``name_del`` removed.

    A suffix is removed together with the space in front of it, and only when
    it is a complete token: the plain substring test used previously turned
    "Ravi Srinivasan" into "Raviinivasan" because " Sr" matched the start of
    the surname. All suffix occurrences are removed, not just those of the
    first suffix kind found.
    """
    return _NAME_DEL_RE.sub("", string)
| |
|
def ln_suff_merge(string, suffixes=None):
    """Glue a surname particle onto the word that follows it.

    Example: ``"jan van der berg"`` becomes ``"jan vanderberg"``.

    Args:
        string: Normalised (lower-case, single-spaced) name.
        suffixes: Particles to look for; defaults to the module-level
            ``ln_suff`` list.

    Returns:
        ``string`` with the first matching particle merged, or ``string``
        unchanged when no particle occurs as a whole word followed by a space.

    Particles are tried longest first so that multi-word entries such as
    "van den" are not shadowed by their prefix "van". A particle only matches
    at the start of the string or right after a space, so a word that merely
    ends with the particle (the "da" in "linda") is left alone.
    """
    if suffixes is None:
        suffixes = ln_suff

    for suff in sorted(suffixes, key=len, reverse=True):
        # (?<![^ ]) == "preceded by a space or by the start of the string".
        pattern = r"(?<![^ ])" + re.escape(suff) + " "
        merged_suff = suff.replace(" ", "")
        # A callable replacement avoids any backslash interpretation.
        result, replaced = re.subn(pattern, lambda _match: merged_suff, string)
        if replaced:
            return result
    return string
| |
|
| | |
| |
|
def normalize(data):
    """Reduce a personal name to a lower-case ASCII form used as a matching key.

    Steps, in order: NFKD-decompose and drop every non-ASCII character; pass
    the text through ``name_delete``; lower-case runs of three or more capital
    letters; insert a space before a capital letter that directly follows a
    word character; lower-case everything; replace each run of characters
    other than letters, digits and spaces with one space; collapse repeated
    spaces and trim; finally pass the text through ``ln_suff_merge``.
    """
    ascii_text = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore').decode("utf-8")

    cleaned = name_delete(ascii_text)
    # ALL-CAPS words are lowered first so the next step does not split them
    # letter by letter.
    cleaned = re.sub(r"[A-Z]{3,}", lambda match: match.group().lower(), cleaned)
    cleaned = re.sub(r"(\w)([A-Z])", r"\1 \2", cleaned)
    cleaned = re.sub('[^A-Za-z0-9 ]+', ' ', cleaned.lower())
    cleaned = re.sub(' +', ' ', cleaned).strip()

    return ln_suff_merge(cleaned)
| |
|
| |
|
def main_inventors(research_key_words, details, size):
    """Summarise the inventors of the first ``size`` patents found for a topic.

    Inventors are grouped by the first and last token of their normalised name
    (see ``normalize``). For every group the PatentsView inventor endpoint is
    queried once per distinct inventor id to total the inventor's patents and
    US citations.

    Args:
        research_key_words: Key words forwarded to ``ranking_patents``.
        details: Reference text forwarded to ``ranking_patents``.
        size: Number of patents (taken from the start of the score table) to
            inspect.

    Returns:
        A pandas ``Styler`` (index hidden) with one row per inventor group,
        ordered by number of occurrences, or the string
        ``"No patent, select other key words"`` when nothing was found.
    """
    ranking = ranking_patents(research_key_words, details)
    # ranking_patents returns a plain string when no patent matches; unpacking
    # that string into two names would raise ValueError.
    if isinstance(ranking, str):
        return "No patent, select other key words"
    dic_patents, dic_ranked = ranking

    # NOTE(review): ranking_patents fills its score table in query order and
    # does not sort it, so this slice is "first `size` results", not
    # "best `size` results" -- confirm that is intended.
    grouped = {}
    for patent in list(dic_ranked)[:size]:
        related_citations = dic_patents[patent]["number_citations"]
        for inventor in dic_patents[patent]["list_inventors"]:
            inventor_id = inventor["inventor_id"]
            # Null name parts are skipped instead of breaking the concatenation.
            inventor_name = " ".join(
                part
                for part in (inventor["inventor_first_name"], inventor["inventor_last_name"])
                if part
            )
            tokens = normalize(inventor_name).split()
            if inventor_id is None or not tokens:
                # Nothing to group on or to query for.
                continue
            inventor_name_norm = tokens[0] + " " + tokens[-1]

            entry = grouped.setdefault(
                inventor_name_norm,
                {"names": [], "ids": [], "occurrences": 0, "citations": 0},
            )
            # Exact membership in lists: a substring test on the joined string
            # would treat the id "x-1" as already present once "x-12" is stored.
            if inventor_id not in entry["ids"]:
                entry["ids"].append(inventor_id)
            if inventor_name not in entry["names"]:
                entry["names"].append(inventor_name)
            entry["occurrences"] += 1
            entry["citations"] += related_citations

    if not grouped:
        return "No patent, select other key words"

    ordered = sorted(grouped.items(), key=lambda item: item[1]["occurrences"], reverse=True)

    dic_patents_co_inventors = {}
    for inventor_name_norm, entry in ordered:
        work_count = 0
        cited_by_count = 0
        for inventor_id in entry["ids"]:
            url = "https://api.patentsview.org/inventors/query?q={%22inventor_id%22:[%22" + inventor_id + "%22]}&f=[%22inventor_total_num_patents%22,%22patent_num_cited_by_us_patents%22]"
            data = get_data(url)["inventors"][0]
            work_count += int(data["inventor_total_num_patents"] or 0)
            for patent in data["patents"]:
                cited_by_count += int(patent["patent_num_cited_by_us_patents"] or 0)

        dic_patents_co_inventors[inventor_name_norm] = {
            "Inventor's name": ", ".join(entry["names"]),
            "PatentsView inventor's id": ", ".join(entry["ids"]),
            "Number of occurence": entry["occurrences"],
            "Number of related citations": entry["citations"],
            "Number of patents": work_count,
            "Number of US patents citations": cited_by_count,
        }

    columns = [
        "Inventor's name",
        "PatentsView inventor's id",
        "Number of occurence",
        "Number of patents",
        "Number of US patents citations",
        "Number of related citations",
    ]
    return pd.DataFrame(dic_patents_co_inventors, index=columns).T.style.hide(axis="index")
| |
|
| |
|
| | |
def map_inventors(research_key_words, details, size):
    """Build a table of inventor coordinates for the first ``size`` patents found.

    Args:
        research_key_words: Key words forwarded to ``ranking_patents``.
        details: Reference text forwarded to ``ranking_patents``.
        size: Number of patents (taken from the start of the score table) to
            inspect.

    Returns:
        A DataFrame with the columns ``latitude``, ``longitude`` (both float),
        ``inventor_name`` and ``patent_date``, one row per inventor of each
        inspected patent, restricted to rows that have a latitude. When no
        patent is found the string ``"No patent, select other key words"`` is
        returned instead.
    """
    ranking = ranking_patents(research_key_words, details)
    # ranking_patents returns a plain string when no patent matches; unpacking
    # that string into two names would raise ValueError.
    if isinstance(ranking, str):
        return "No patent, select other key words"
    dic_patents, dic_ranked = ranking

    dic_patents_co_inventors = {}
    for patent in list(dic_ranked)[:size]:
        patent_date = dic_patents[patent]["date"]
        for inventor in dic_patents[patent]["list_inventors"]:
            # Rows are numbered consecutively across all patents.
            dic_patents_co_inventors[len(dic_patents_co_inventors)] = {
                "latitude": inventor["inventor_latitude"],
                "longitude": inventor["inventor_longitude"],
                "inventor_name": str(inventor["inventor_first_name"]) + " " + str(inventor["inventor_last_name"]),
                "patent_date": patent_date,
            }

    if not dic_patents_co_inventors:
        return "No patent, select other key words"

    map_df = pd.DataFrame(dic_patents_co_inventors).T
    map_df["longitude"] = map_df["longitude"].astype(float)
    map_df["latitude"] = map_df["latitude"].astype(float)
    # Inventors without a known location cannot be placed on the map.
    return map_df[map_df["latitude"].notnull()]
| |
|
| |
|