"""Export the similarity lists from ``description.xlsx`` to a CSV file.

Reads the ``likely_similar`` sheet and, for every column after the first,
collects the cell values that are not equal to ``False``.  The per-column
lists are padded with NaN to a common length and written to
``similarity_dict.csv`` with one row per source column.
"""

import csv

import pandas as pd


xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')

# The first column is skipped; the headers of all remaining columns are the
# keys of the mapping built below.
source_databases = xls.iloc[:, 1:].columns

# Maps each column header to the list of values kept from that column.
similarity_dict = {}
for source_database in source_databases:
    series = xls.loc[:, source_database]
    # Element-wise comparison (not an identity test): drops cells equal to
    # False.  Note that 0 also compares equal to False and is dropped, while
    # NaN cells compare unequal to False and are therefore kept.
    similar_databases = series[series != False].values.tolist()  # noqa: E712
    similarity_dict[source_database] = similar_databases

# ``default=0`` keeps this from raising ValueError when the sheet has no
# columns beyond the first one (empty mapping).
max_len = max((len(v) for v in similarity_dict.values()), default=0)

# DataFrame.from_dict needs equally long lists, so right-pad the shorter ones
# with NaN.  A new dict is built instead of reassigning entries mid-iteration.
similarity_dict = {
    k: v + [float('nan')] * (max_len - len(v))
    for k, v in similarity_dict.items()
}

# Keys become columns first; transposing turns each key into one row.
df = pd.DataFrame.from_dict(similarity_dict)
df = df.transpose()

# NOTE(review): after the transpose the column headers from the sheet live in
# the index, and index=False leaves them out of the CSV, so rows are written
# without a label.  Confirm this is intended.
df.to_csv('similarity_dict.csv', index=False)