下面是一个示例函数,该函数接受两个数据集(pandas DataFrame)作为参数,使用皮尔逊相关系数计算两个数据集各列之间的相关性,并返回相关性最强的一对列名:
def strongest_correlation(dataset1, dataset2):
    """Return the column pair with the strongest correlation across two datasets.

    Every column of ``dataset1`` is compared with every column of
    ``dataset2`` using ``Series.corr`` (Pearson by default in pandas). The
    pair whose coefficient has the largest absolute value wins, so a strong
    negative correlation beats a weaker positive one. On ties the first pair
    encountered is kept.

    Args:
        dataset1: Table exposing ``.columns`` and column access by name,
            where each column supports ``.corr`` (e.g. a pandas DataFrame).
        dataset2: Second table with the same capabilities.

    Returns:
        A ``(col1, col2)`` tuple naming the column from ``dataset1`` and the
        column from ``dataset2``, or ``None`` when no pair produces a defined
        coefficient (no columns, or every coefficient is NaN).
    """
    import math  # function-scope import keeps the snippet self-contained

    strongest_corr = None
    strongest_col = None
    for col1 in dataset1.columns:
        # Look the left-hand column up once instead of once per inner iteration.
        series1 = dataset1[col1]
        for col2 in dataset2.columns:
            corr = series1.corr(dataset2[col2])
            # A NaN coefficient (e.g. from a constant column) must be skipped:
            # once stored, every later ``abs(corr) > abs(nan)`` comparison is
            # False and the search would be stuck on that undefined pair.
            if math.isnan(corr):
                continue
            if strongest_corr is None or abs(corr) > abs(strongest_corr):
                strongest_corr = corr
                strongest_col = (col1, col2)
    return strongest_col
使用示例:
import pandas as pd

# Dataset 1
df1 = pd.read_csv('dataset1.csv')
# Dataset 2
df2 = pd.read_csv('dataset2.csv')

# Find the most strongly correlated pair of columns
strongest_col = strongest_correlation(df1, df2)

# Print the result. strongest_correlation returns only the column names, so
# the coefficient is recomputed here for the winning pair; the function's
# internal ``strongest_corr`` variable is not visible at this level.
if strongest_col is None:
    print('未找到可计算相关性的列')
else:
    col1, col2 = strongest_col
    strongest_corr = df1[col1].corr(df2[col2])
    print('最强相关性为:', strongest_col, '相关系数为:', abs(strongest_corr))