# Source: EmmaScharfmannBerkeley — Update Climate_site/python_scripts/related_scientist.py
# commit 1ed737d (verified)
## package imports
import pandas as pd
import streamlit as st
import numpy as np
import json, requests
#from pandas.io.json import json_normalize
from tqdm import tqdm
path = "Climate_site/python_scripts/"


@st.cache_data  # cache the parsed table for the Streamlit session
def load_data():
    """Load the institutions TSV into a lookup dict.

    Reads ``institutions.tsv`` (tab-separated, indexed by its second
    column — presumably the institution id; confirm against the file)
    and returns ``{index_value: {column: value, ...}}``.
    """
    tsv_file = path + "institutions.tsv"
    table = pd.read_csv(tsv_file, sep="\t", index_col=1)
    return table.to_dict(orient="index")


dic_institutions = load_data()
# Continent -> ISO 3166-1 alpha-2 country codes, used by from_author_id()
# to filter results by region. Fixed: 'GT' was listed twice under
# North America in the original.
# NOTE(review): 'HT' (Haiti) appears to be absent from North America —
# confirm whether that omission is intentional.
dic_country_codes = {
    'Europe': [
        'AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CZ', 'DE', 'DK', 'EE', 'FI',
        'FR', 'GR', 'HU', 'IE', 'IS', 'IT', 'LI', 'LT', 'LU', 'LV', 'MK',
        'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RU', 'SE', 'SI', 'SK', 'SM',
        'UA', 'VA', 'BA', 'HR', 'MD', 'MC', 'ME', 'RS', 'ES', 'CH', 'GB',
    ],
    'Asia': [
        'AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'CY', 'GE', 'ID',
        'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KP', 'KR', 'KW', 'LB',
        'MM', 'MN', 'MV', 'MY', 'NP', 'OM', 'PH', 'PK', 'QA', 'SA', 'SG',
        'SY', 'TH', 'TJ', 'TM', 'TR', 'UZ', 'VN', 'YE', 'KH', 'TL', 'KZ',
        'LA', 'LK', 'AE',
    ],
    'North America': [
        'AG', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'DM', 'DO', 'GT', 'HN',
        'JM', 'MX', 'NI', 'PA', 'TT', 'US', 'SV', 'GD', 'KN', 'LC', 'VC',
    ],
    'Africa': [
        'AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CG', 'CI', 'CM', 'CV', 'DJ',
        'EG', 'ER', 'ET', 'GA', 'GH', 'GM', 'GN', 'GW', 'KE', 'LR', 'LS',
        'LY', 'MG', 'ML', 'MR', 'MU', 'MW', 'MZ', 'NA', 'NE', 'NG', 'RW',
        'SC', 'SD', 'SL', 'SN', 'SO', 'ST', 'TG', 'TN', 'TZ', 'UG', 'ZM',
        'ZW', 'DZ', 'CF', 'TD', 'KM', 'GQ', 'MA', 'ZA', 'SZ',
    ],
    'South America': [
        'AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PE', 'PY', 'SR', 'UY',
        'VE',
    ],
    'Oceania': [
        'AU', 'FJ', 'KI', 'MH', 'NR', 'NZ', 'PG', 'PW', 'SB', 'TO', 'TV',
        'VU', 'FM', 'WS',
    ],
}
#################### General Functions #############################
def URL(base_URL, entity_type, filters):
    """Assemble an API request URL from base, entity type, and filter string."""
    return base_URL + entity_type + filters
def get_data(url):
    """GET `url` and return the JSON-decoded response body.

    Propagates whatever `requests`/`json` raise on network or decode
    errors; callers here wrap calls in try/except for best-effort use.
    """
    # Use a distinct name for the response instead of rebinding `url`;
    # `json` is already imported at module level, so the former
    # function-local `import json` was redundant.
    response = requests.get(url)
    return json.loads(response.text)
## selecting the ids we want
def author_id_from_name(name):
    """Search OpenAlex for authors matching `name`.

    Parameters
    ----------
    name : str
        Author name; title-cased before querying.

    Returns
    -------
    (pandas.DataFrame, list[tuple[str, str]])
        DataFrame of up to 200 candidate authors indexed by OpenAlex
        author id, plus (author_id, human-readable summary) pairs for a
        selection widget.
    """
    dic_names = {}
    name = name.title()
    # NOTE(review): `name` is interpolated unescaped into the query URL;
    # names containing '&' or '#' would need urllib.parse.quote — confirm.
    url = ("https://api.openalex.org/authors?search=" + name
           + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
    data = get_data(url)["results"]
    for record in data:
        author_id = record["id"][21:]  # strip the "https://openalex.org/" prefix
        entry = {
            "author_name": record["display_name"],
            "number_of_works": record["works_count"],
            "number_of_citations": record["cited_by_count"],
            # Up to the first five associated concepts, comma-joined.
            "field_of_study": ", ".join(
                concept["display_name"] for concept in record["x_concepts"][:5]
            ),
            "last_known_institution": None,
            "country_code": None,
        }
        institutions = record["last_known_institutions"]
        if institutions is not None and len(institutions) > 0:
            entry["last_known_institution"] = institutions[0]["display_name"]
            entry["country_code"] = institutions[0]["country_code"]
        entry["orcid"] = record["orcid"]
        dic_names[author_id] = entry
    options = [
        (author_id,
         ", ".join([entry["author_name"],
                    str(entry["last_known_institution"]),
                    str(entry["field_of_study"].split(", ")[0]),
                    "Number of works: " + str(entry["number_of_works"])]))
        for author_id, entry in dic_names.items()
    ]
    return pd.DataFrame(dic_names).T, options
def _record_related_author(dic_workers, authorship, main_author_id, counter):
    """Tally one OpenAlex authorship entry into `dic_workers`.

    Creates the author's record on first sight, fills in institution /
    country / institution-id from the first listed institution when not
    yet known, and increments `counter` ("co_authors" or "citations").
    The searched author (`main_author_id`) is skipped.
    """
    author_id = authorship["author"]["id"][21:]  # strip "https://openalex.org/"
    if author_id == main_author_id:
        return
    if author_id not in dic_workers:
        dic_workers[author_id] = {
            "author_name": authorship["author"]["display_name"],
            "co_authors": 0,
            "citations": 0,
            "institution": None,
            "country_code": None,
            "id": None,
            "longitude": None,
            "latitude": None,
        }
    entry = dic_workers[author_id]
    institutions = authorship["institutions"]
    if institutions:
        first = institutions[0]
        if entry["institution"] is None and "display_name" in first:
            entry["institution"] = first["display_name"]
        if entry["country_code"] is None and "country_code" in first:
            entry["country_code"] = first["country_code"]
        if entry["id"] is None and first.get("id") is not None:
            entry["id"] = first["id"][21:]
    entry[counter] += 1


def from_author_id(main_author_ids, year, country_code, size):
    """Rank authors related to `main_author_ids` by co-authorship and citations.

    For every work published after `year` by any of the given OpenAlex
    author ids, tallies each co-author and each author of a work that
    cites it. Requests are best-effort: a failed fetch skips that
    author/work instead of aborting the whole run (the original code
    used bare `except: pass`; narrowed to `except Exception` here).

    Parameters
    ----------
    main_author_ids : iterable of str
        OpenAlex author ids without the URL prefix.
    year : int
        Only works with publication_year strictly greater than this.
    country_code : str or False
        False → no geographic filter; one of the continent names in
        `dic_country_codes` → filter by continent; any other string is
        treated as a two-letter country code.
    size : int
        Number of rows returned (head of the ranking).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The ranking sorted by citation count, and the same ranking
        restricted to rows with known institution coordinates (with
        added longitude/latitude columns).
    """
    dic_main_workers = {}
    for main_author_id in main_author_ids:
        works_url = ("https://api.openalex.org/works?filter=author.id:"
                     + main_author_id + ",publication_year:>" + str(year)
                     + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
        try:
            works = get_data(works_url)["results"]
        except Exception:  # best-effort: skip authors whose works can't be fetched
            continue
        for work in tqdm(works):
            try:
                work_id = work["id"][21:]
                for authorship in work["authorships"]:
                    _record_related_author(dic_main_workers, authorship,
                                           main_author_id, "co_authors")
                citing_url = ("https://api.openalex.org/works?filter=referenced_works:"
                              + work_id + ",publication_year:>" + str(year)
                              + "&per_page=200&mailto=emma_scharfmann@berkeley.edu")
                citing_works = get_data(citing_url)["results"]
                for citing_work in citing_works:
                    for authorship in citing_work["authorships"]:
                        _record_related_author(dic_main_workers, authorship,
                                               main_author_id, "citations")
            except Exception:  # best-effort: a bad work / citation lookup is skipped
                continue
    # Attach coordinates from the locally loaded institutions table.
    for entry in dic_main_workers.values():
        inst_id = entry["id"]
        if inst_id is not None and inst_id in dic_institutions:
            geo = dic_institutions[inst_id]
            entry["longitude"] = geo["longitude"]
            entry["latitude"] = geo["latitude"]
    # Build the sorted frame once (the original built and sorted it twice).
    ranked = pd.DataFrame(dic_main_workers).T.sort_values("citations", ascending=False)
    res = ranked[['author_name', 'co_authors', 'citations', 'institution', 'country_code']]
    res_geo = ranked[['author_name', 'co_authors', 'citations', 'institution',
                      'country_code', 'longitude', 'latitude']]
    res_geo = res_geo[res_geo["longitude"].notnull()]
    if country_code != False:  # False is a deliberate sentinel, distinct from None/str
        if country_code in ["Europe", "North America", "Asia", "South America", "Oceania", "Africa"]:
            codes = dic_country_codes[country_code]
            return (res[res["country_code"].isin(codes)].head(size),
                    res_geo[res_geo["country_code"].isin(codes)].head(size))
        return (res[res["country_code"] == country_code].head(size),
                res_geo[res_geo["country_code"] == country_code].head(size))
    return res.head(size), res_geo.head(size)