Spaces:
Sleeping
Sleeping
import csv

import pandas as pd

# Build a CSV of "likely similar" databases from an Excel sheet.
#
# Reads the 'likely_similar' sheet of description.xlsx, collects for every
# column after the first the cell values that are not False, pads those lists
# to a common length with NaN, and writes one row per source column to
# similarity_dict.csv.

# Read the Excel sheet into a pandas DataFrame.
xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')

# Every column after the first is treated as a source database.
source_databases = xls.iloc[:, 1:].columns

# Map each source database (column label) to the values kept from its column.
similarity_dict = {}
for source_database in source_databases:
    series = xls.loc[:, source_database]
    # Element-wise comparison on purpose: `is not False` would test the Series
    # object itself rather than each cell.
    # NOTE(review): blank cells (NaN) pass this filter because NaN != False,
    # and a numeric 0 is dropped because 0 == False -- confirm this is intended.
    similar_databases = series[series != False].values.tolist()
    similarity_dict[source_database] = similar_databases

# Length of the longest list; default=0 avoids a ValueError from max() when
# the sheet has no columns after the first.
max_len = max((len(v) for v in similarity_dict.values()), default=0)

# Pad the shorter lists with NaN so all lists have the same length, which the
# DataFrame constructor requires for dict-of-lists input.
similarity_dict = {
    k: v + [float('nan')] * (max_len - len(v))
    for k, v in similarity_dict.items()
}

# Keys become columns, then the transpose turns each source database into a row.
df = pd.DataFrame.from_dict(similarity_dict)
df = df.transpose()

# Write the DataFrame to a CSV file.
# NOTE(review): after the transpose the source database names are the index,
# so index=False leaves them out of the file and the header row is just the
# positional labels 0..max_len-1 -- confirm the names are not needed.
df.to_csv('similarity_dict.csv', index=False)