# %%
# Configure logging: root logger at DEBUG, existing library loggers at INFO
import logging
import os

import dotenv
import numpy as np
import pandas as pd
import pyalex
import rdflib
import tensorflow as tf
from ampligraph.datasets import (
    GraphDataLoader,
    SQLiteAdapter,
    DataSourceIdentifier,
)
from ampligraph.datasets.graph_partitioner import (
    NaiveGraphPartitioner,
    BucketGraphPartitioner,
)
from ampligraph.evaluation import (
    train_test_split_no_unseen,
    hits_at_n_score,
    mrr_score,
)
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from pyalex import Authors, Concepts, Funders, Institutions, Publishers, Sources, Works
from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.DEBUG)
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.INFO)

# Load the triples from the files
X_train = np.load("train.npy")
X_valid = np.load("valid.npy")
X_test = np.load("test.npy")

## Store as CSVs. There are commas in the names of some institutions, so we use a tab as the delimiter
# np.savetxt("train.csv", X_train, delimiter="\t", fmt="%s")
# np.savetxt("valid.csv", X_valid, delimiter="\t", fmt="%s")
# np.savetxt("test.csv", X_test, delimiter="\t", fmt="%s")
#
# print(f"Train size: {X_train.shape[0]}")
# print(f"Valid size: {X_valid.shape[0]}")
# print(f"Test size: {X_test.shape[0]}")

# Initialize a ComplEx neural embedding model:
# - k is the embedding size,
# - eta is the number of corruptions to generate per positive,
# - scoring_type selects the scoring function of the embedding model.
partitioned_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type="ComplEx")

# Optimizer, loss and regularizer definition
optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = get_loss("pairwise", {"margin": 0.5})
regularizer = get_regularizer("LP", {"p": 2, "lambda": 1e-5})

# Compile the model
partitioned_model.compile(
    optimizer=optim, loss=loss, entity_relation_regularizer=regularizer
)
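
# --- Optional sketch (not part of the original pipeline): a quick sanity check of the
# loaded splits. Hedged assumption: each .npy file holds string triples of shape (N, 3)
# laid out as (subject, predicate, object), which is the layout fit() and evaluate()
# consume below; adjust the indexing if your arrays are stored differently.
sanity_check = False
if sanity_check:
    for name, split in (("train", X_train), ("valid", X_valid), ("test", X_test)):
        print(f"{name}: {split.shape[0]} triples, e.g. {split[0]}")
    # Count distinct entities (subjects and objects) and relations in the training split
    n_entities = len(np.unique(np.concatenate((X_train[:, 0], X_train[:, 2]))))
    n_relations = len(np.unique(X_train[:, 1]))
    print(f"train split: {n_entities} unique entities, {n_relations} unique relations")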
# For evaluation, we can use a filter to remove positive statements
# created by the corruption procedure.
# Here we define the filter set by concatenating all the positives
filter = {"test": np.concatenate((X_train, X_valid, X_test))}

# Early stopping callback
checkpoint = tf.keras.callbacks.EarlyStopping(
    monitor="val_hits10",
    min_delta=0,
    patience=5,
    verbose=1,
    mode="max",
    restore_best_weights=True,
)

######
use_db = False
if use_db:
    AMPLIGRAPH_DATA_HOME = os.path.join(os.getcwd(), "data")  # + os.sep
    from ampligraph.datasets.data_indexer import SQLite as SQLiteIndexer, DataIndexer

    # Initialize GraphDataLoader from the .csv file
    sqlite_indexer = SQLiteIndexer(
        data=None,
        db_file="main_partition.db",
        root_directory=AMPLIGRAPH_DATA_HOME,
        name="main_partition",
    )
    indexer = DataIndexer(
        X=None,
        backend_type='sqlite',
        backend=sqlite_indexer,
    )
    dataset_loader = GraphDataLoader(
        "train.csv",
        backend=SQLiteAdapter,
        in_memory=False,
        verbose=True,
        root_directory=AMPLIGRAPH_DATA_HOME,
        db_name="mydb.db",
        use_indexer=indexer,
    )
    # adapter = SQLiteAdapter(
    #     "database_25-12-2023_07-28-41_485047_PM_2a11fc49-2337-415e-8672-2bfa48a83745.db",
    #     identifier=DataSourceIdentifier,
    #     root_directory=AMPLIGRAPH_DATA_HOME,
    # )
    print("Graph data loader initialized")
    # for elem in next(dataset_loader._get_batch_generator()):
    #     print(elem)
    #     break
######
else:
    X_train = np.load("train.npy")
    dataset_loader = GraphDataLoader(
        X_train,
        verbose=True,
        use_indexer=True,
        in_memory=True,
    )
    print(f'next: {next(dataset_loader)}')
    print(f'next: {next(dataset_loader)}')
    print(f'next: {next(dataset_loader)}')

# x = np.loadtxt(
#     "train.csv",
#     delimiter="\t",
#     dtype=str,
# )
# print(x[0])

# Choose the partitioner - in this case NaiveGraphPartitioner
partition = False
if partition:
    print("Will start partitioning now")
    graph_partitioner_train = NaiveGraphPartitioner(dataset_loader, k=6)
    print("Graph partitioner initialized")

# indexer = (
#     partitioned_model.data_handler.get_mapper()
# )  # get the mapper from the trained model
# dataset_loader_test = GraphDataLoader(
#     data_source=X_test,
#     backend=SQLiteAdapter,  # type of backend to use
#     batch_size=400,  # batch size to use while iterating over this dataset
#     dataset_type="test",  # dataset type
#     use_indexer=indexer,  # mapper to map test concepts to the same indices used during training
#     verbose=True,
# )
# graph_partitioner_test = BucketGraphPartitioner(data=partitioner, k=3)

print("Will start training now")

# Fit the model on the training set and validate on the held-out validation split
partitioned_model.fit(
    # graph_partitioner_train if partition else dataset_loader,
    X_train,
    batch_size=500,
    epochs=45,  # Number of training epochs
    validation_freq=20,  # Epochs between successive validation rounds
    validation_burn_in=20,  # Epoch at which validation starts; must not exceed epochs, or validation (and early stopping) never runs
    validation_data=X_valid,  # Validation data (the test split stays unseen until the final evaluation)
    validation_filter=filter,  # Filter positives from validation corruptions
    callbacks=[checkpoint],  # Early stopping callback (more from tf.keras.callbacks are supported)
    verbose=True,  # Enable stdout messages
    # partitioning_k=7,  # Number of partitions to create
)

# %%
# Store the model: raw Keras weights via tf.keras.Model.save_weights, plus AmpliGraph metadata
super(ScoringBasedEmbeddingModel, partitioned_model).save_weights("model/")
partitioned_model.save_metadata(filedir="model")
# from ampligraph.utils import save_model
# save_model(partitioned_model, model_name_path='model.pkl')
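
# --- Optional sketch: persist and reload the whole model with the pickle helpers
# referenced in the commented-out save_model call above. Hedged: restore_model comes
# from ampligraph.utils, and 'model.pkl' is an illustrative path not used elsewhere
# in this script.
reload_check = False
if reload_check:
    from ampligraph.utils import save_model, restore_model

    save_model(partitioned_model, model_name_path="model.pkl")
    restored_model = restore_model(model_name_path="model.pkl")
    # The restored model can be used like the in-memory one, e.g. for filtered ranking
    restored_ranks = restored_model.evaluate(X_test, use_filter=filter, corrupt_side="s,o")
    print(f"MRR after reload: {mrr_score(restored_ranks)}")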

# %%
# Create a dataframe of the institutions and their names
import pandas as pd
import rdflib

g = rdflib.Graph()
uri = "urn:acmcmc:unis:"
unis = rdflib.Namespace(uri)
g.bind("unis", unis)
g.parse("universities_large_1200.ttl", format="turtle")
query_results = g.query(
    """
    SELECT DISTINCT ?institution ?name
    WHERE {
        ?institution a unis:Institution .
        ?institution unis:name ?name .
    }
    """
)
institutions = pd.DataFrame(query_results, columns=["institution", "name"])
institutions["institution"] = institutions["institution"].apply(lambda x: str(x))
institutions["name"] = institutions["name"].apply(lambda x: str(x))
# Store the dataframe
institutions.to_csv("institutions.csv", index=False)

# %%
# Run the evaluation procedure on the test set (with filtering)
# To disable filtering: use_filter=None
# Usually, we corrupt subject and object sides separately and compute ranks
ranks = partitioned_model.evaluate(X_test, use_filter=filter, corrupt_side="s,o")

# Compute and print metrics
mrr = mrr_score(ranks)
hits_10 = hits_at_n_score(ranks, n=10)
print("MRR: %f, Hits@10: %f" % (mrr, hits_10))
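
# --- Optional sketch: a few extra ranking metrics computed from the same rank array.
# Hedged: mr_score and the extra Hits@N cut-offs come from ampligraph.evaluation and
# are not part of the original report above.
from ampligraph.evaluation import mr_score

print("MR: %f" % mr_score(ranks))
print("Hits@1: %f, Hits@3: %f" % (hits_at_n_score(ranks, n=1), hits_at_n_score(ranks, n=3)))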