Spaces:

acmc
/

Universities-Explorer

Sleeping

App Files Files Community

Universities-Explorer / app.py

acmc

new model

115f2ee 6 months ago

raw

history blame contribute delete

No virus

13.7 kB

	# %%
	import gradio as gr
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import rdflib
	import seaborn as sns
	import tensorflow as tf
	from adjustText import adjust_text
	from ampligraph.latent_features import ScoringBasedEmbeddingModel
	from ampligraph.utils import restore_model
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA
	import logging


	logger = logging.getLogger(__name__)

	# Time how long it takes to parse the knowledge graph from disk
	start_time = tf.timestamp()

	# Graph that the SPARQL queries in this file are run against
	g = rdflib.Graph()
	uri = "urn:acmcmc:unis:"
	unis = rdflib.Namespace(uri)
	g.bind("unis", unis)
	g.parse("universities.ttl", format="turtle")

	end_time = tf.timestamp()
	# The difference of two tf.timestamp() values is a tensor; convert it to a
	# plain float so the log shows a number rather than the tensor repr.
	logger.info("Graph loaded in %.3f seconds", float(end_time - start_time))

	# model = restore_model("model.pkl")

	# Time how long it takes to rebuild the embedding model and load its weights
	start_time = tf.timestamp()
	model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")
	model.load_metadata("model/model")
	model.build_full_model()
	# Resolve load_weights starting *after* ScoringBasedEmbeddingModel in the
	# MRO, i.e. use the parent class implementation on this instance.
	super(ScoringBasedEmbeddingModel, model).load_weights("model/")
	end_time = tf.timestamp()
	# Convert the tensor difference to a float so the log shows a plain number.
	logger.info("Model loaded in %.3f seconds", float(end_time - start_time))


	def separate_concepts(concepts):
	    """
	    Split a comma-separated string of concepts into a list

	    Each entry is stripped of surrounding whitespace. Entries that are empty
	    after stripping (e.g. from a trailing comma or an empty input) are dropped.
	    """
	    stripped = (piece.strip() for piece in concepts.split(","))
	    return [piece for piece in stripped if piece]


	def pca(embeddings):
	    """
	    Fit a 2-component PCA on the embeddings and return them projected onto it
	    """
	    reducer = PCA(n_components=2)
	    # fit() returns the estimator itself, so the projection can be chained
	    return reducer.fit(embeddings).transform(embeddings)


	def cluster(embeddings):
	    """
	    Run KMeans with 6 clusters on the embeddings and return the cluster
	    assigned to each of them
	    """
	    # random_state is fixed, so the same input gets the same clustering
	    kmeans = KMeans(n_clusters=6, n_init=50, max_iter=500, random_state=0)
	    return kmeans.fit_predict(embeddings)


	def get_concept_name(concept_uri):
	    """
	    Get the name of the concept from the URI

	    Returns the first ?name bound to the concept in the graph `g`.
	    Raises KeyError when the graph has no name for the concept, and whatever
	    rdflib raises when `concept_uri` is not a serialisable URI.
	    """
	    # URIRef.n3() renders the value as "<uri>" and refuses values containing
	    # characters that are illegal in a URI (such as ">" or spaces), so text
	    # typed by the user cannot change the structure of the query.
	    concept_term = rdflib.URIRef(concept_uri).n3()
	    results = g.query(
	        f"""SELECT ?name
	        WHERE {{
	            {concept_term} <urn:acmcmc:unis:name> ?name .
	        }}"""
	    )
	    for row in results:
	        # Only ?name is projected, so it is the first item of the row
	        return row[0]
	    raise KeyError(f"No name found in the graph for concept {concept_uri!r}")


	def get_similarities_to_node(array_of_triples, model):
	    """
	    Score a list of triples with the model

	    The triples are first translated with `model.get_indexes`, and the result
	    of calling the model on those indexes is returned unchanged.
	    """
	    # NOTE(review): an earlier comment described this as a cosine similarity;
	    # the code only returns the model's output for the triples - confirm what
	    # the scores represent.
	    return model(model.get_indexes(array_of_triples))


	def process_user_input_concept(concept_chooser):
	    """
	    The user input is the URI of the concept (several URIs can be given,
	    separated by commas). Get the similarities between the concepts and the
	    institutions

	    Returns a tuple with:
	    - a dataframe with one row per institution and the columns "institution",
	      one "similarity_to_<concept name>" per concept, "mean_similarity" and
	      "institution_name", sorted by descending mean similarity
	    - two `gr.update(visible=True)`, one for each extra output component

	    Raises gr.Error when no concept URI was given.
	    """
	    # Drop empty entries (e.g. from a trailing comma) and repeated URIs while
	    # keeping the order in which the user wrote them
	    chosen_concepts = list(
	        dict.fromkeys(c for c in separate_concepts(concept_chooser) if c)
	    )
	    if not chosen_concepts:
	        raise gr.Error("Please enter at least one concept URI.")

	    all_ids_institutions = np.loadtxt(
	        "institutions.csv", delimiter=",", skiprows=1, dtype=str, quotechar='"'
	    )
	    # Remove duplicates based on the first column
	    all_ids_institutions = all_ids_institutions[
	        ~pd.DataFrame(all_ids_institutions).duplicated(0).to_numpy()
	    ]
	    institution_ids = all_ids_institutions[:, 0]

	    # One SPARQL lookup per concept; the names are only used as column labels
	    chosen_concepts_names = [get_concept_name(c) for c in chosen_concepts]
	    similarity_columns = [f"similarity_to_{name}" for name in chosen_concepts_names]

	    # The predicate is the same for every triple, so build it once
	    predicate = np.full(
	        len(institution_ids), "urn:acmcmc:unis:institution_related_to_concept"
	    )
	    per_concept_scores = []
	    for concept in chosen_concepts:
	        # Triples (institution, institution_related_to_concept, concept)
	        objects = np.full(len(institution_ids), concept)
	        array_of_triples = np.array([institution_ids, predicate, objects]).T
	        scores = get_similarities_to_node(array_of_triples, model)
	        # Flatten to one value per institution so it fits a dataframe column
	        per_concept_scores.append(np.asarray(scores).flatten())

	    # Now, average the similarities
	    mean_scores = np.mean(np.stack(per_concept_scores, axis=0), axis=0)

	    # Columns are added in display order: the individual similarities come
	    # before the mean similarity, and the institution name goes last
	    table_df = pd.DataFrame({"institution": institution_ids})
	    for column, scores in zip(similarity_columns, per_concept_scores):
	        table_df[column] = scores
	    table_df["mean_similarity"] = mean_scores
	    table_df["institution_name"] = all_ids_institutions[:, 1]

	    # Sort by mean similarity
	    table_df = table_df.sort_values(by=["mean_similarity"], ascending=False)

	    return (
	        table_df,
	        gr.update(visible=True),
	        gr.update(visible=True),
	    )


	def calculate_emdeddings_and_pca(table):
	    """
	    Get the embeddings of the institutions in the table, project them with
	    `pca` and group them with `cluster`

	    Returns a dataframe with the columns "embedding_x", "embedding_y" and
	    "cluster" (the cluster number prefixed with "cluster"). A toast message is
	    shown before and after the computation.
	    """
	    gr.Info("Performing PCA and clustering...")

	    institution_vectors = model.get_embeddings(
	        entities=np.array(table["institution"])
	    )
	    # Both steps work on the full embeddings, not on each other's output
	    projected = pca(institution_vectors)
	    labels = cluster(institution_vectors)

	    cluster_names = "cluster" + pd.Series(labels).astype(str)
	    plot_df = pd.DataFrame(
	        {
	            "embedding_x": projected[:, 0],
	            "embedding_y": projected[:, 1],
	            "cluster": cluster_names,
	        }
	    )

	    gr.Info("PCA and clustering done!")
	    return plot_df


	def click_on_institution(table, embeddings_var, evt: gr.SelectData):
	    """
	    Redraw the embeddings plot highlighting the institution of the selected row

	    `embeddings_var` is the dictionary that `click_on_show_plot` returns; its
	    "embeddings_df" entry holds the coordinates and clusters to draw.
	    Returns the figure, or None when there are no embeddings yet or the plot
	    cannot be rebuilt from them.
	    """
	    # evt.index[0] is the position of the selected row
	    institution_id = table["institution"].iloc[evt.index[0]]

	    embeddings_df = (embeddings_var or {}).get("embeddings_df")
	    if embeddings_df is None:
	        # Nothing has been stored yet, so there is nothing to highlight
	        return None

	    try:
	        plot_df = pd.DataFrame(
	            {
	                "institution": table["institution"].values,
	                "institution_name": table["institution_name"].values,
	                "embedding_x": embeddings_df["embedding_x"].values,
	                "embedding_y": embeddings_df["embedding_y"].values,
	                "cluster": embeddings_df["cluster"].values,
	            }
	        )
	        return plot_embeddings(plot_df, institution_id)
	    except Exception:
	        # Stay best-effort (e.g. the table no longer matches the stored
	        # embeddings), but leave a trace instead of failing silently
	        logger.exception("Could not redraw the embeddings plot")
	        return None


	def click_on_show_plot(table):
	    """
	    Calculate the embeddings, PCA and clusters of the institutions in the table
	    and plot them

	    Returns the figure and a dictionary whose "embeddings_df" entry is the
	    dataframe that was plotted, so that it can be reused without recomputing.
	    """
	    embeddings_df = calculate_emdeddings_and_pca(table)

	    plot_df = pd.DataFrame(
	        {
	            "institution": table["institution"].values,
	            # The table column and the key read by plot_embeddings are both
	            # spelled "institution_name"
	            "institution_name": table["institution_name"].values,
	            "embedding_x": embeddings_df["embedding_x"].values,
	            "embedding_y": embeddings_df["embedding_y"].values,
	            "cluster": embeddings_df["cluster"].values,
	        }
	    )
	    # No institution is highlighted on the first draw
	    fig = plot_embeddings(plot_df, None)

	    return fig, {"embeddings_df": plot_df}


	def plot_embeddings(plot_df, institution_id):
	    """
	    Draw a scatter plot of the embeddings, coloured by cluster

	    `plot_df` needs the columns "institution", "institution_name",
	    "embedding_x", "embedding_y" and "cluster". When `institution_id` matches
	    a row, that institution is marked with a black cross and labelled with its
	    name. Returns the matplotlib figure.
	    """
	    fig = plt.figure(figsize=(12, 12))
	    # NOTE(review): nothing below draws random numbers as far as this function
	    # shows; kept because it resets the global NumPy seed as before - confirm
	    # whether it is still needed.
	    np.random.seed(0)
	    # Draw on an explicit axes instead of relying on pyplot's current axes
	    ax = fig.add_subplot(1, 1, 1)
	    sns.scatterplot(
	        data=plot_df,
	        x="embedding_x",
	        y="embedding_y",
	        hue="cluster",
	        ax=ax,
	    )

	    row_of_institution = plot_df[plot_df["institution"] == institution_id]
	    if not row_of_institution.empty:
	        # Use scalars rather than one-row Series for the coordinates
	        selected = row_of_institution.iloc[0]
	        x, y = selected["embedding_x"], selected["embedding_y"]
	        ax.text(
	            x,
	            y,
	            selected["institution_name"],
	            horizontalalignment="left",
	            size="medium",
	            color="black",
	            weight="normal",
	        )
	        # Also draw a point for the institution
	        ax.scatter(x, y, color="black", s=100, marker="x")

	    # Unregister the figure from pyplot so that figures do not pile up in a
	    # long-running app; the Figure object itself can still be rendered.
	    plt.close(fig)
	    return fig


	def get_authors_of_institution(institutions_table, concept_chooser, evt: gr.SelectData):
	    """
	    Get the authors of an institution

	    The institution is the one in the selected row of `institutions_table`.
	    For every concept in `concept_chooser`, the graph is queried for the
	    authors with articles written in that institution and related to the
	    concept. Returns a dataframe with the columns "author", "name" and
	    "num_articles" (summed over the concepts, sorted in descending order),
	    and a `gr.update(visible=True)`.
	    """
	    # evt.index[0] is the position of the selected row
	    institution = institutions_table["institution"].iloc[evt.index[0]]
	    # URIRef.n3() renders "<uri>" and refuses values that are not valid URIs,
	    # so neither value can change the structure of the query
	    institution_term = rdflib.URIRef(institution).n3()

	    columns = ["author", "name", "num_articles"]
	    # Skip empty entries and repeated URIs (repeats would be counted twice)
	    concepts = dict.fromkeys(c for c in separate_concepts(concept_chooser) if c)
	    results_dfs = []
	    for concept in concepts:
	        concept_term = rdflib.URIRef(concept).n3()
	        # Authors and the number of articles they have written for this concept
	        result = g.query(
	            f"""SELECT ?author ?name (COUNT (?article) AS ?num_articles)
	            WHERE {{
	                ?author a <urn:acmcmc:unis:Author> .
	                ?author <urn:acmcmc:unis:name> ?name .
	                ?article <urn:acmcmc:unis:written_in_institution> {institution_term} .
	                ?article <urn:acmcmc:unis:has_author> ?author .
	                ?article <urn:acmcmc:unis:related_to_concept> {concept_term} .
	            }}
	            GROUP BY ?author ?name
	            ORDER BY DESC(COUNT (?article))
	            """
	        )
	        # Convert the rdflib terms to plain values so the counts can be summed
	        # and sorted as numbers; rows without an author are ignored
	        rows = [
	            (str(author), str(name), int(num_articles))
	            for author, name, num_articles in result
	            if author is not None
	        ]
	        # Passing the columns keeps the frame well-formed when there are no rows
	        results_dfs.append(pd.DataFrame(rows, columns=columns))

	    if not results_dfs:
	        return pd.DataFrame(columns=columns), gr.update(visible=True)

	    # Now, aggregate the results into a single dataframe by summing the number of articles
	    results_df = pd.concat(results_dfs)
	    results_df = results_df.groupby(["author", "name"]).sum().reset_index()
	    # Sort by number of articles
	    results_df = results_df.sort_values(by=["num_articles"], ascending=False)
	    return results_df, gr.update(visible=True)


	# %%
	theme = gr.themes.Default(primary_hue="cyan", secondary_hue="fuchsia")

	# NOTE(review): the nesting of the `with` blocks below was reconstructed;
	# confirm which components belong to each gr.Group.
	# The lines of the multi-line texts are kept flush so their content is unchanged.
	with gr.Blocks(theme=theme) as demo:
	    # Holds the dictionary returned by click_on_show_plot between events
	    embeddings_df = gr.State({})
	    # App title and description
	    title = gr.Markdown(
	        """
	# Universities Explorer
	This app allows you to explore the institutions more closely related to a concept.

	It uses embeddings of institutions and concepts to calculate the similarity between them. The embedding model, [ComplEx](https://doi.org/10.48550/arXiv.1606.06357), was trained using the [AmpliGraph](https://github.com/Accenture/AmpliGraph) library. The data comes from the [OpenAlex](https://openalex.org/) dataset, which contains information about scientific articles, authors, institutions, and concepts.
	"""
	    )
	    with gr.Group() as institution_search:
	        concept_chooser = gr.Textbox(
	            label="Concept URI",
	            info="Using OpenAlex, find the URI of the concept you want to search for. For example, the URI of the concept 'Knowledge Graph' is https://openalex.org/C2987255567, while the URI of the concept 'Natural Language Processing' is https://openalex.org/C204321447. You can find the URI of a concept by searching for it on OpenAlex and copying the URL from the address bar. You can also search for multiple concepts by separating them with a comma.",
	            placeholder="https://openalex.org/C2987255567, https://openalex.org/C204321447",
	            value="https://openalex.org/C2987255567, https://openalex.org/C204321447",
	        )
	        concept_name_label = gr.Markdown("Concept name: ", visible=False)
	        # Table for name of institution and similarity to concept
	        btn_search_institutions = gr.Button("Search institutions", variant="primary")
	        table = gr.Dataframe(
	            interactive=False, visible=False, elem_classes="institutions", wrap=True
	        )
	        # Show the table as soon as a search is requested
	        btn_search_institutions.click(
	            lambda: gr.update(visible=True), outputs=[table], queue=True
	        )

	        btn_plot_embeddings = gr.Button(
	            "Plot embeddings", variant="primary", visible=False, elem_classes="embeddings"
	        )
	        # Description of what plot embeddings does
	        plot_embeddings_info = gr.Markdown(
	            """
	This button will plot the embeddings of the institutions related to the concept. The embeddings are calculated using the trained model and then reduced to 2 dimensions using PCA. The institutions are then clustered using KMeans.

	Running this may take a while, as we need to calculate the embeddings for all institutions and then perform PCA and clustering.
	""",
	            visible=False,
	        )
	        # Fill the table and reveal the plot button and its description
	        btn_search_institutions.click(
	            process_user_input_concept,
	            inputs=[concept_chooser],
	            outputs=[
	                table,
	                btn_plot_embeddings,
	                plot_embeddings_info,
	            ],
	            queue=True,
	        )
	        plot = gr.Plot(visible=False, elem_classes="embeddings")
	        # Show the plot area, then fill it and remember the plotted data
	        btn_plot_embeddings.click(
	            lambda: gr.update(visible=True), outputs=[plot], queue=True
	        )
	        btn_plot_embeddings.click(
	            click_on_show_plot,
	            inputs=[table],
	            outputs=[plot, embeddings_df],
	            queue=True,
	        )

	    # When the user selects a row in the table, get the authors of that institution and display them in a dataframe
	    with gr.Group(visible=False, elem_classes="authors") as authors:
	        table_authors = gr.Dataframe(
	            interactive=False, label="Authors in institution writing about concept"
	        )
	    # get_authors_of_institution returns the authors and a visibility update;
	    # the update goes to the `authors` group, which starts hidden and would
	    # otherwise never be shown
	    table.select(
	        get_authors_of_institution,
	        inputs=[table, concept_chooser],
	        outputs=[table_authors, authors],
	    )
	    # Selecting a row also highlights the institution in the plot
	    table.select(
	        click_on_institution,
	        inputs=[table, embeddings_df],
	        outputs=[plot],
	    )

	    btn_clear = gr.ClearButton(components=[table, plot, table_authors])

	    # Author information
	    author_info = gr.Markdown(
	        """
	This demo has been built by [Aldan Creo](
	https://acmc-website.web.app/).
	"""
	    )

	demo.queue()
	demo.launch()