# Source: EmmaScharfmannBerkeley — Update Climate_site/python_scripts/related_scientist.py
# commit 1ed737d (verified)
## package imports
import pandas as pd
import streamlit as st
import numpy as np
import json, requests
#from pandas.io.json import json_normalize
from tqdm import tqdm
path = "Climate_site/python_scripts/"


@st.cache_data  # cache the parsed table for the Streamlit session
def load_data():
    """Load the institutions TSV into a lookup dict.

    Reads ``institutions.tsv`` (tab-separated, indexed by its second
    column — presumably the institution id; confirm against the file)
    and returns ``{index_value: {column: value, ...}}``.
    """
    tsv_file = path + "institutions.tsv"
    table = pd.read_csv(tsv_file, sep="\t", index_col=1)
    return table.to_dict(orient="index")


dic_institutions = load_data()
# Continent -> ISO 3166-1 alpha-2 country codes, used by from_author_id()
# to filter results by region. Fixed: 'GT' was listed twice under
# North America in the original.
# NOTE(review): 'HT' (Haiti) appears to be absent from North America —
# confirm whether that omission is intentional.
dic_country_codes = {
    'Europe': [
        'AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CZ', 'DE', 'DK', 'EE', 'FI',
        'FR', 'GR', 'HU', 'IE', 'IS', 'IT', 'LI', 'LT', 'LU', 'LV', 'MK',
        'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RU', 'SE', 'SI', 'SK', 'SM',
        'UA', 'VA', 'BA', 'HR', 'MD', 'MC', 'ME', 'RS', 'ES', 'CH', 'GB',
    ],
    'Asia': [
        'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'CY', 'GE', 'ID',
        'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KP', 'KR', 'KW', 'LB',
        'MM', 'MN', 'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'QA', 'SA', 'SG',
        'SY', 'TH', 'TJ', 'TM', 'TR', 'UZ', 'VN', 'YE', 'KH', 'TL', 'KZ',
        'LA', 'LK', 'AE',
    ],
    'North America': [
        'AG', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'DM', 'DO', 'GT', 'HN',
        'JM', 'MX', 'NI', 'PA', 'TT', 'US', 'SV', 'GD', 'KN', 'LC', 'VC',
    ],
    'Africa': [
        'AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CG', 'CI', 'CM', 'CV', 'DJ',
        'EG', 'ER', 'ET', 'GA', 'GH', 'GM', 'GN', 'GW', 'KE', 'LR', 'LS',
        'LY', 'MG', 'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RW',
        'SC', 'SD', 'SL', 'SN', 'SO', 'ST', 'TG', 'TN', 'TZ', 'UG', 'ZM',
        'ZW', 'DZ', 'CF', 'TD', 'KM', 'GQ', 'MA', 'ZA', 'SZ',
    ],
    'South America': [
        'AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PE', 'PY', 'SR', 'UY',
        'VE',
    ],
    'Oceania': [
        'AU', 'FJ', 'KI', 'MH', 'NR', 'NZ', 'PG', 'PW', 'SB', 'TO', 'TV',
        'VU', 'FM', 'WS',
    ],
}
#################### General Functions #############################
def URL(base_URL, entity_type, filters):
    """Assemble an API request URL from base, entity type, and filter string."""
    return base_URL + entity_type + filters
def get_data(url):
    """GET `url` and return the JSON-decoded response body.

    Propagates whatever `requests`/`json` raise on network or decode
    errors; callers here wrap calls in try/except for best-effort use.
    """
    # Use a distinct name for the response instead of rebinding `url`;
    # `json` is already imported at module level, so the former
    # function-local `import json` was redundant.
    response = requests.get(url)
    return json.loads(response.text)
## selecting the ids we want
def author_id_from_name(name):
    """Search OpenAlex for authors matching `name`.

    Parameters
    ----------
    name : str
        Author name; title-cased before querying.

    Returns
    -------
    (pandas.DataFrame, list[tuple[str, str]])
        DataFrame of up to 200 candidate authors indexed by OpenAlex
        author id, plus (author_id, human-readable summary) pairs for a
        selection widget.
    """
    dic_names = {}
    name = name.title()
    # NOTE(review): `name` is interpolated unescaped into the query URL;
    # names containing '&' or '#' would need urllib.parse.quote — confirm.
    url = ("https://api.openalex.org/authors?search=" + name
           + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
    data = get_data(url)["results"]
    for record in data:
        author_id = record["id"][21:]  # strip the "https://openalex.org/" prefix
        entry = {
            "author_name": record["display_name"],
            "number_of_works": record["works_count"],
            "number_of_citations": record["cited_by_count"],
            # Up to the first five associated concepts, comma-joined.
            "field_of_study": ", ".join(
                concept["display_name"] for concept in record["x_concepts"][:5]
            ),
            "last_known_institution": None,
            "country_code": None,
        }
        institutions = record["last_known_institutions"]
        if institutions is not None and len(institutions) > 0:
            entry["last_known_institution"] = institutions[0]["display_name"]
            entry["country_code"] = institutions[0]["country_code"]
        entry["orcid"] = record["orcid"]
        dic_names[author_id] = entry
    options = [
        (author_id,
         ", ".join([entry["author_name"],
                    str(entry["last_known_institution"]),
                    str(entry["field_of_study"].split(", ")[0]),
                    "Number of works: " + str(entry["number_of_works"])]))
        for author_id, entry in dic_names.items()
    ]
    return pd.DataFrame(dic_names).T, options
def _record_related_author(dic_workers, authorship, main_author_id, counter):
    """Tally one OpenAlex authorship entry into `dic_workers`.

    Creates the author's record on first sight, fills in institution /
    country / institution-id from the first listed institution when not
    yet known, and increments `counter` ("co_authors" or "citations").
    The searched author (`main_author_id`) is skipped.
    """
    author_id = authorship["author"]["id"][21:]  # strip "https://openalex.org/"
    if author_id == main_author_id:
        return
    if author_id not in dic_workers:
        dic_workers[author_id] = {
            "author_name": authorship["author"]["display_name"],
            "co_authors": 0,
            "citations": 0,
            "institution": None,
            "country_code": None,
            "id": None,
            "longitude": None,
            "latitude": None,
        }
    entry = dic_workers[author_id]
    institutions = authorship["institutions"]
    if institutions:
        first = institutions[0]
        if entry["institution"] is None and "display_name" in first:
            entry["institution"] = first["display_name"]
        if entry["country_code"] is None and "country_code" in first:
            entry["country_code"] = first["country_code"]
        if entry["id"] is None and first.get("id") is not None:
            entry["id"] = first["id"][21:]
    entry[counter] += 1


def from_author_id(main_author_ids, year, country_code, size):
    """Rank authors related to `main_author_ids` by co-authorship and citations.

    For every work published after `year` by any of the given OpenAlex
    author ids, tallies each co-author and each author of a work that
    cites it. Requests are best-effort: a failed fetch skips that
    author/work instead of aborting the whole run (the original code
    used bare `except: pass`; narrowed to `except Exception` here).

    Parameters
    ----------
    main_author_ids : iterable of str
        OpenAlex author ids without the URL prefix.
    year : int
        Only works with publication_year strictly greater than this.
    country_code : str or False
        False → no geographic filter; one of the continent names in
        `dic_country_codes` → filter by continent; any other string is
        treated as a two-letter country code.
    size : int
        Number of rows returned (head of the ranking).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The ranking sorted by citation count, and the same ranking
        restricted to rows with known institution coordinates (with
        added longitude/latitude columns).
    """
    dic_main_workers = {}
    for main_author_id in main_author_ids:
        works_url = ("https://api.openalex.org/works?filter=author.id:"
                     + main_author_id + ",publication_year:>" + str(year)
                     + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
        try:
            works = get_data(works_url)["results"]
        except Exception:  # best-effort: skip authors whose works can't be fetched
            continue
        for work in tqdm(works):
            try:
                work_id = work["id"][21:]
                for authorship in work["authorships"]:
                    _record_related_author(dic_main_workers, authorship,
                                           main_author_id, "co_authors")
                citing_url = ("https://api.openalex.org/works?filter=referenced_works:"
                              + work_id + ",publication_year:>" + str(year)
                              + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
                citing_works = get_data(citing_url)["results"]
                for citing_work in citing_works:
                    for authorship in citing_work["authorships"]:
                        _record_related_author(dic_main_workers, authorship,
                                               main_author_id, "citations")
            except Exception:  # best-effort: a bad work / citation lookup is skipped
                continue
    # Attach coordinates from the locally loaded institutions table.
    for entry in dic_main_workers.values():
        inst_id = entry["id"]
        if inst_id is not None and inst_id in dic_institutions:
            geo = dic_institutions[inst_id]
            entry["longitude"] = geo["longitude"]
            entry["latitude"] = geo["latitude"]
    # Build the sorted frame once (the original built and sorted it twice).
    ranked = pd.DataFrame(dic_main_workers).T.sort_values("citations", ascending=False)
    res = ranked[['author_name', 'co_authors', 'citations', 'institution', 'country_code']]
    res_geo = ranked[['author_name', 'co_authors', 'citations', 'institution',
                      'country_code', 'longitude', 'latitude']]
    res_geo = res_geo[res_geo["longitude"].notnull()]
    if country_code != False:  # False is a deliberate sentinel, distinct from None/str
        if country_code in ["Europe", "North America", "Asia", "South America", "Oceania", "Africa"]:
            codes = dic_country_codes[country_code]
            return (res[res["country_code"].isin(codes)].head(size),
                    res_geo[res_geo["country_code"].isin(codes)].head(size))
        return (res[res["country_code"] == country_code].head(size),
                res_geo[res_geo["country_code"] == country_code].head(size))
    return res.head(size), res_geo.head(size)