text-analysis / read_sheet.py
Daryl Fung
initial commit
9b9ea2f
raw
history blame
1.22 kB
import csv
import pandas as pd
# read an Excel file into a Pandas dataframe
xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')
xls.iloc[:, 1:] # select all rows and all columns starting from the second column
# get the names of the source databases from the dataframe
source_databases = xls.iloc[:, 1:].columns
# create a dictionary to hold the similarity data
similarity_dict = {}
# loop through each source database and get the list of similar databases
for source_database in source_databases:
series = xls.loc[:, source_database]
similar_databases = series[series != False].values.tolist()
similarity_dict[source_database] = similar_databases
# find the length of the longest list
max_len = max(len(v) for v in similarity_dict.values())
# pad the shorter lists with NaNs to make them the same length as the longest list
for k, v in similarity_dict.items():
if len(v) < max_len:
similarity_dict[k] = v + [float('nan')] * (max_len - len(v))
# convert the dictionary to a Pandas dataframe and transpose it
df = pd.DataFrame.from_dict(similarity_dict)
df = df.transpose()
# write the dataframe to a CSV file
df.to_csv('similarity_dict.csv', index=False)