# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.

import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde

dirname = os.path.dirname(__file__)

# Load the csv file into a pandas DataFrame
papers_df = pd.read_csv(
    os.path.join(dirname, "data/nlp_papers_field_diversity.csv")
)

# Compute the mean CFDI
mean_cfdi = papers_df["incoming_diversity"].mean()

# Sample of citation ages behind the mAoC plot. It stays empty at import time;
# generate_maoc_plot() fills it in place from _CITATION_AGE_PATH on first use
# if nothing else has populated it by then.
mean_citation_ages = []

_CITATION_AGE_PATH = os.path.join(dirname, "data/nlp_papers_citation_age.txt")


def _read_citation_ages(path):
    """Return the floats stored one per line in *path*.

    Returns an empty list when the file does not exist, so callers can decide
    how to report missing data. Blank lines are skipped.

    NOTE(review): the one-float-per-line format is taken from the loader that
    used to live in this module -- confirm against the actual data file.
    """
    if not os.path.isfile(path):
        return []
    with open(path, "r", encoding="utf-8") as filehandle:
        # float() ignores surrounding whitespace, so no manual newline
        # stripping is needed (slicing off the last character would corrupt a
        # final line that has no trailing newline).
        return [float(line) for line in filehandle if line.strip()]


def _plot_distribution(data, input_value, compute_type, x_label):
    """Draw the KDE of *data* and mark *input_value* and the mean on it.

    Args:
        data: pandas Series of numeric observations.
        input_value: x position to highlight with a red star and a dashed line.
        compute_type: noun used in the "This ..." annotation next to the star.
        x_label: label for the x axis.

    Returns:
        The newly created matplotlib figure.
    """
    sns.set(font_scale=1.3, style="whitegrid")

    # Evaluate the density on a regular grid spanning the observed range.
    kde = gaussian_kde(data)
    x_vals = np.linspace(data.min(), data.max(), 1000)
    y_vals = kde.evaluate(x_vals)

    fig, ax = plt.subplots()  # create a new figure and axis
    ax.fill_between(x_vals, y_vals, color="skyblue", alpha=0.3)
    ax.plot(x_vals, y_vals, color="skyblue", linewidth=2, label="Distribution")

    # Height of the curve at the highlighted value. np.interp clamps to the
    # end points of the grid, so a value outside the observed range gets the
    # density at the nearest edge.
    input_y = np.interp(input_value, x_vals, y_vals)
    ax.scatter(
        input_value,
        input_y,
        c="r",
        marker="*",
        linewidths=2,
        zorder=2,
        s=32,
    )
    ax.vlines(input_value, 0, input_y, color="tomato", ls="--", lw=1.5)

    # Mark the sample average with a grey dashed line up to the curve.
    mean_val = np.mean(data)
    mean_y = np.interp(mean_val, x_vals, y_vals)
    ax.vlines(mean_val, 0, mean_y, color="grey", ls="--", lw=1.5)

    # Small offset (in data coordinates) so labels do not sit on the markers.
    epsilon = 0.005
    ax.text(
        mean_val + epsilon,
        mean_y + epsilon,
        "Avg.",
        {"color": "grey", "fontsize": 13},
        ha="left",  # Horizontal alignment
    )
    ax.text(
        input_value + epsilon,
        input_y + epsilon,
        f"This {compute_type}",
        {"color": "#DC143C", "fontsize": 13},
        ha="left",  # Horizontal alignment
    )

    ax.set_xlabel(x_label, fontsize=15)
    ax.set_ylabel("Density", fontsize=15)
    sns.despine(left=True, bottom=True, right=True, top=True)

    return fig


def generate_cfdi_plot(input_cfdi, compute_type="paper"):
    """Plot the CFDI distribution and highlight *input_cfdi* on it.

    The distribution is the KDE of the strictly positive
    ``incoming_diversity`` values in ``papers_df``.

    Args:
        input_cfdi: CFDI value to highlight.
        compute_type: noun used in the "This ..." annotation.

    Returns:
        The matplotlib figure containing the plot.
    """
    diversity = papers_df["incoming_diversity"]
    data = diversity[diversity > 0]
    return _plot_distribution(
        data,
        input_cfdi,
        compute_type,
        "Citation Field Diversity Index (CFDI)",
    )


def generate_maoc_plot(input_maoc, compute_type="paper"):
    """Plot the mAoC distribution and highlight *input_maoc* on it.

    The distribution is the KDE of ``mean_citation_ages``. If that list is
    still empty when this function is called, it is filled in place from
    ``_CITATION_AGE_PATH``; data placed there by other code is left untouched.

    Args:
        input_maoc: mean-age-of-citation value to highlight.
        compute_type: noun used in the "This ..." annotation.

    Returns:
        The matplotlib figure containing the plot.

    Raises:
        ValueError: if no citation-age data is available.
    """
    if not mean_citation_ages:
        # Extend in place so modules holding a reference to the list see it.
        mean_citation_ages.extend(_read_citation_ages(_CITATION_AGE_PATH))
    if not mean_citation_ages:
        raise ValueError(
            "No citation-age data available: mean_citation_ages is empty and "
            f"{_CITATION_AGE_PATH!r} was not found or contains no values."
        )

    data = pd.Series(mean_citation_ages, dtype=float)
    return _plot_distribution(
        data,
        input_maoc,
        compute_type,
        "Mean Age of Citation (mAoC)",
    )