|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import streamlit as st |
|
|
import numpy as np |
|
|
import json, requests |
|
|
|
|
|
from tqdm import tqdm |
|
|
|
|
|
# Base directory (relative to the working directory) that holds the
# institutions.tsv file read by load_data() below.
path = "Climate_site/python_scripts/"
|
|
|
|
|
@st.cache_data
def load_data():
    """Load the institutions TSV into a lookup dict.

    Cached by Streamlit so the file is read only once per session.

    Returns:
        dict: mapping keyed by the TSV's second column (``index_col=1``),
        as produced by ``DataFrame.to_dict('index')``; each value is the
        row's remaining columns as a dict (these rows carry the
        ``longitude``/``latitude`` fields used for geo joins later in
        this script).
    """
    tsv_file = path + "institutions.tsv"
    frame = pd.read_csv(tsv_file, delimiter="\t", index_col=1)
    return frame.to_dict('index')
|
|
|
|
|
# Cached institution metadata, keyed by the TSV's second column; the row
# dicts include "longitude"/"latitude" consumed by from_author_id's geo join.
dic_institutions = load_data()
|
|
|
|
|
|
|
|
# Continent name -> ISO 3166-1 alpha-2 country codes. Used by
# from_author_id to widen a single-country filter to a whole continent.
# NOTE: the original listed 'GT' (Guatemala) twice under North America;
# the duplicate has been removed.
dic_country_codes = {
    'Europe': ['AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CZ', 'DE', 'DK', 'EE',
               'FI', 'FR', 'GR', 'HU', 'IE', 'IS', 'IT', 'LI', 'LT', 'LU',
               'LV', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RU', 'SE',
               'SI', 'SK', 'SM', 'UA', 'VA', 'BA', 'HR', 'MD', 'MC', 'ME',
               'RS', 'ES', 'CH', 'GB'],
    'Asia': ['AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'CY', 'GE',
             'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KP', 'KR',
             'KW', 'LB', 'MM', 'MN', 'MV', 'MY', 'NP', 'OM', 'PH', 'PK',
             'QA', 'SA', 'SG', 'SY', 'TH', 'TJ', 'TM', 'TR', 'UZ', 'VN',
             'YE', 'KH', 'TL', 'KZ', 'LA', 'LK', 'AE'],
    'North America': ['AG', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'DM', 'DO',
                      'GT', 'HN', 'JM', 'MX', 'NI', 'PA', 'TT', 'US', 'SV',
                      'GD', 'KN', 'LC', 'VC'],
    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CG', 'CI', 'CM', 'CV',
               'DJ', 'EG', 'ER', 'ET', 'GA', 'GH', 'GM', 'GN', 'GW', 'KE',
               'LR', 'LS', 'LY', 'MG', 'ML', 'MR', 'MU', 'MW', 'MZ', 'NA',
               'NE', 'NG', 'RW', 'SC', 'SD', 'SL', 'SN', 'SO', 'ST', 'TG',
               'TN', 'TZ', 'UG', 'ZM', 'ZW', 'DZ', 'CF', 'TD', 'KM', 'GQ',
               'MA', 'ZA', 'SZ'],
    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PE', 'PY',
                      'SR', 'UY', 'VE'],
    'Oceania': ['AU', 'FJ', 'KI', 'MH', 'NR', 'NZ', 'PG', 'PW', 'SB', 'TO',
                'TV', 'VU', 'FM', 'WS'],
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def URL(base_URL, entity_type, filters):
    """Assemble an API request URL by plain concatenation.

    Args:
        base_URL: API root string.
        entity_type: entity path segment appended to the root.
        filters: query-string suffix appended last (caller supplies any
            leading "?" or "&").

    Returns:
        str: ``base_URL + entity_type + filters``.
    """
    return "".join([base_URL, entity_type, filters])
|
|
|
|
|
|
|
|
def get_data(url):
    """Fetch *url* over HTTP and decode the response body as JSON.

    Args:
        url: fully-formed request URL.

    Returns:
        The decoded JSON payload (a dict for the OpenAlex endpoints used
        in this script).

    Raises:
        requests.RequestException: on network failure.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    # Keep the parameter and the response in distinct names (the original
    # shadowed `url`), and rely on the module-level `json` import.
    response = requests.get(url)
    return json.loads(response.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def author_id_from_name(name):
    """Search OpenAlex for authors matching *name* and collect metadata.

    Args:
        name: author name to search for; it is title-cased before querying.

    Returns:
        tuple:
            - pandas.DataFrame: one row per candidate author id, with
              name, work/citation counts, top concepts, last known
              institution/country and ORCID.
            - list[tuple[str, str]]: (author_id, one-line summary) pairs
              suitable for a selection widget.
    """
    name = name.title()
    url = ("https://api.openalex.org/authors?search=" + name
           + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
    data = get_data(url)["results"]

    dic_names = {}
    for author in data:
        # OpenAlex ids are URLs; strip the "https://openalex.org/" prefix.
        author_id = author["id"][21:]
        entry = {
            "author_name": author["display_name"],
            "number_of_works": author["works_count"],
            "number_of_citations": author["cited_by_count"],
            # Up to the author's five top concepts, comma-joined.
            "field_of_study": ", ".join(
                concept["display_name"] for concept in author["x_concepts"][:5]
            ),
            "last_known_institution": None,
            "country_code": None,
        }
        institutions = author["last_known_institutions"]
        if institutions:  # may be None or empty
            entry["last_known_institution"] = institutions[0]["display_name"]
            entry["country_code"] = institutions[0]["country_code"]
        entry["orcid"] = author["orcid"]
        dic_names[author_id] = entry

    summaries = [
        (author_id, ", ".join([
            entry["author_name"],
            str(entry["last_known_institution"]),
            entry["field_of_study"].split(", ")[0],
            "Number of works: " + str(entry["number_of_works"]),
        ]))
        for author_id, entry in dic_names.items()
    ]
    return pd.DataFrame(dic_names).T, summaries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _record_authorship(workers, authorship, main_author_id, count_key):
    """Fold one OpenAlex authorship record into the *workers* dict.

    Creates the author's entry on first sight, back-fills institution
    metadata from the first listed institution, and increments *count_key*
    ("co_authors" or "citations"). The main author themself is skipped.
    """
    author_id = authorship["author"]["id"][21:]
    if author_id == main_author_id:
        return
    if author_id not in workers:
        workers[author_id] = {
            "author_name": authorship["author"]["display_name"],
            "co_authors": 0,
            "citations": 0,
            "institution": None,
            "country_code": None,
            "id": None,         # OpenAlex institution id (URL prefix stripped)
            "longitude": None,  # filled later from dic_institutions
            "latitude": None,
        }
    entry = workers[author_id]
    institutions = authorship["institutions"]
    if institutions:
        first = institutions[0]
        # Only back-fill fields still unknown, keeping the first value seen.
        if entry["institution"] is None and "display_name" in first:
            entry["institution"] = first["display_name"]
        if entry["country_code"] is None and "country_code" in first:
            entry["country_code"] = first["country_code"]
        if entry["id"] is None and first.get("id") is not None:
            entry["id"] = first["id"][21:]
    entry[count_key] += 1


def from_author_id(main_author_ids, year, country_code, size):
    """Collect co-authors and citing authors for a set of OpenAlex authors.

    For every id in *main_author_ids*: fetch works published after *year*,
    count each co-author, then fetch the works citing each of those works
    and count each citing author. Institution coordinates are joined in
    from the module-level ``dic_institutions`` table.

    Args:
        main_author_ids: iterable of OpenAlex author ids (short form).
        year: only works with publication_year strictly greater than this
            are considered (both for works and for citing works).
        country_code: ``False`` for no filtering; a continent name present
            in ``dic_country_codes``; or a two-letter country code.
        size: maximum number of rows in each returned DataFrame.

    Returns:
        tuple(res, res_geo): DataFrames sorted by citation count,
        descending. ``res`` has author/co_authors/citations/institution/
        country_code columns; ``res_geo`` adds longitude/latitude and
        drops rows without coordinates.
    """
    workers = {}

    for main_author_id in main_author_ids:
        works_url = ("https://api.openalex.org/works?filter=author.id:" + main_author_id
                     + ",publication_year:>" + str(year)
                     + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
        # Best effort: a failed request or malformed record for one author
        # must not abort the whole collection.
        try:
            data = get_data(works_url)["results"]
            for work in tqdm(data):
                work_id = work["id"][21:]
                for authorship in work["authorships"]:
                    _record_authorship(workers, authorship, main_author_id, "co_authors")

                citations_url = ("https://api.openalex.org/works?filter=referenced_works:" + work_id
                                 + ",publication_year:>" + str(year)
                                 + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
                try:
                    citing_works = get_data(citations_url)["results"]
                    for citing_work in citing_works:
                        for authorship in citing_work["authorships"]:
                            _record_authorship(workers, authorship, main_author_id, "citations")
                except Exception:
                    pass  # skip this work's citation list, keep going
        except Exception:
            pass  # skip this author, keep going

    # Join institution coordinates from the cached institutions table.
    for entry in workers.values():
        institution_id = entry["id"]
        if institution_id is not None and institution_id in dic_institutions:
            geo = dic_institutions[institution_id]
            entry["longitude"] = geo["longitude"]
            entry["latitude"] = geo["latitude"]

    columns = ['author_name', 'co_authors', 'citations', 'institution', 'country_code']
    geo_columns = columns + ['longitude', 'latitude']

    # Guard: sort_values("citations") would raise on an empty, column-less
    # frame when nothing was collected.
    if not workers:
        return pd.DataFrame(columns=columns), pd.DataFrame(columns=geo_columns)

    # Build and sort the frame once (the original did this twice).
    frame = pd.DataFrame(workers).T.sort_values("citations", ascending=False)
    res = frame[columns]
    res_geo = frame[geo_columns]
    res_geo = res_geo[res_geo["longitude"].notnull()]

    if country_code is not False:
        if country_code in dic_country_codes:
            # Continent name: widen to all of its country codes.
            codes = dic_country_codes[country_code]
            return (res[res["country_code"].isin(codes)].head(size),
                    res_geo[res_geo["country_code"].isin(codes)].head(size))
        return (res[res["country_code"] == country_code].head(size),
                res_geo[res_geo["country_code"] == country_code].head(size))
    return res.head(size), res_geo.head(size)
|
|
|
|
|
|
|
|
|