Pandas Correlation Analysis
A Detailed Guide to Correlation Analysis with Pandas
1. Correlation Fundamentals
1.1 Types of Correlation
- Pearson correlation coefficient: measures linear relationships; assumes roughly normal distributions
- Spearman correlation coefficient: a rank-based non-parametric method, suited to nonlinear but monotonic relationships (a short comparison sketch follows this list)
- Kendall correlation coefficient: based on concordant/discordant pairs, suited to ordinal data
- Partial correlation: the correlation between two variables after controlling for other variables
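To make the Pearson/Spearman distinction concrete, here is a minimal sketch (synthetic data, not part of the dataset built below) on a monotonic but nonlinear relationship: Spearman sees a perfect rank ordering, while Pearson is pulled below 1 by the curvature.
import numpy as np
import pandas as pd

x = pd.Series(np.linspace(0, 5, 50))
y = np.exp(x)  # monotonic but strongly nonlinear

print(f"Pearson:  {x.corr(y, method='pearson'):.3f}")   # noticeably below 1
print(f"Spearman: {x.corr(y, method='spearman'):.3f}")  # exactly 1.000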
1.2 Interpreting Correlation Coefficients
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Guide to correlation strength (ranges of |r|)
correlation_guide = {
    'strong': (0.7, 1.0),
    'moderate': (0.3, 0.7),
    'weak': (0.1, 0.3),
    'negligible': (0.0, 0.1)
}

def interpret_correlation(coef):
    """Interpret the strength and direction of a correlation coefficient."""
    abs_coef = abs(coef)
    for strength, (low, high) in correlation_guide.items():
        # Treat the top bin as inclusive so that |r| == 1.0 counts as strong
        if low <= abs_coef < high or (high == 1.0 and abs_coef == 1.0):
            if strength == 'negligible':
                return f"negligible correlation (r={coef:.3f})"
            direction = 'positive' if coef > 0 else 'negative'
            return f"{direction} {strength} correlation (r={coef:.3f})"
    return f"negligible correlation (r={coef:.3f})"
2. Basic Correlation Computation
2.1 Pearson Correlation Coefficient
# Create example data
np.random.seed(42)
df = pd.DataFrame({
    'sales': np.random.normal(100, 20, 100),
    'advertising': np.random.normal(50, 10, 100),
    'price': np.random.normal(20, 5, 100),
    'temperature': np.random.normal(25, 5, 100)
})

# Inject known relationships
df['sales'] = df['advertising'] * 0.5 + df['price'] * (-0.3) + np.random.normal(0, 5, 100)

# Compute the correlation matrix
corr_matrix = df.corr()  # Pearson by default
print("Pearson correlation matrix:")
print(corr_matrix)

# Correlation between two specific variables
sales_adv_corr = df['sales'].corr(df['advertising'])
print(f"\nSales-advertising correlation: {sales_adv_corr:.3f}")
print(interpret_correlation(sales_adv_corr))
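Pandas computes correlations over pairwise-complete observations, silently skipping NaNs. The sketch below (the injected missingness is purely illustrative) shows how the min_periods parameter of DataFrame.corr guards against coefficients estimated from too little overlap:
# df.corr() uses pairwise-complete observations; min_periods returns NaN
# for any pair with fewer than that many overlapping rows
df_missing = df.copy()
df_missing.loc[df_missing.sample(frac=0.4, random_state=0).index, 'price'] = np.nan

print(df_missing.corr())                # pairwise, NaNs skipped
print(df_missing.corr(min_periods=80))  # NaN where overlap falls below 80 rows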
2.2 Different Correlation Methods
# Spearman (rank-based) correlation
spearman_corr = df.corr(method='spearman')
print("Spearman correlation matrix:")
print(spearman_corr)

# Kendall correlation
kendall_corr = df.corr(method='kendall')
print("Kendall correlation matrix:")
print(kendall_corr)

# Compare the three methods on the same pair
methods = ['pearson', 'spearman', 'kendall']
for method in methods:
    corr = df['sales'].corr(df['advertising'], method=method)
    print(f"{method.capitalize()}: {corr:.3f}")
2.3 Covariance Matrix
# Covariance (depends on variable scales, unlike correlation)
cov_matrix = df.cov()
print("Covariance matrix:")
print(cov_matrix)

# Standardized covariance is exactly the correlation matrix
print("\nStandardized (correlation matrix):")
print(corr_matrix)
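The two matrices are linked by corr(X, Y) = cov(X, Y) / (σ_X · σ_Y); a quick numerical check of that identity:
# Recover the correlation matrix from the covariance matrix by dividing by
# the outer product of the standard deviations
std = np.sqrt(np.diag(cov_matrix))
manual_corr = cov_matrix / np.outer(std, std)
print(np.allclose(manual_corr, corr_matrix))  # True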
3. Visualizing Correlations
3.1 Heatmaps
# Basic heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix,
            annot=True,        # display the values
            cmap='coolwarm',   # color map
            center=0,          # center the color scale at zero
            square=True,       # square cells
            fmt='.3f')         # number format
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

# Masked heatmap: hide the redundant upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # upper-triangle mask
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix,
            mask=mask,
            annot=True,
            cmap='RdBu_r',
            center=0,
            square=True,
            fmt='.3f',
            cbar_kws={'shrink': 0.8})
plt.title('Correlation Heatmap (lower triangle)')
plt.show()
3.2 Correlation Bar Chart
# Correlation of sales with every other variable
sales_corr = corr_matrix['sales'].drop('sales').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sales_corr.plot(kind='bar')
plt.title('Correlation of Sales with Other Variables')
plt.ylabel('Correlation coefficient')
plt.xticks(rotation=45)
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
plt.tight_layout()
plt.show()
3.3 Scatter Plot Matrix
# Pairwise scatter plots
sns.pairplot(df,
             kind='scatter',
             diag_kind='kde',
             plot_kws={'alpha': 0.6})
plt.suptitle('Pairwise Scatter Plot Matrix', y=1.02)
plt.show()

# Restrict to the variables of interest
sns.pairplot(df[['sales', 'advertising', 'price']],
             kind='reg',        # add regression lines
             diag_kind='hist')
plt.show()
4. Statistical Significance Testing
4.1 Computing p-values
from scipy import stats

def correlation_with_pvalue(x, y, method='pearson'):
    """Compute a correlation coefficient together with its p-value."""
    if method == 'pearson':
        r, p = stats.pearsonr(x, y)
    elif method == 'spearman':
        r, p = stats.spearmanr(x, y)
    elif method == 'kendall':
        r, p = stats.kendalltau(x, y)
    return {'correlation': r, 'p_value': p, 'significant': p < 0.05}

def correlation_matrix_with_pvalues(df, method='pearson'):
    """Full correlation matrix with a matching matrix of p-values."""
    numeric_df = df.select_dtypes(include=[np.number])  # ignore non-numeric columns
    corr_matrix = numeric_df.corr(method=method)
    p_matrix = pd.DataFrame(index=corr_matrix.columns, columns=corr_matrix.columns,
                            dtype=float)
    for col1 in numeric_df.columns:
        for col2 in numeric_df.columns:
            if col1 != col2:
                result = correlation_with_pvalue(numeric_df[col1], numeric_df[col2], method)
                p_matrix.loc[col1, col2] = result['p_value']
            else:
                p_matrix.loc[col1, col2] = np.nan
    return corr_matrix, p_matrix

corr, pvals = correlation_matrix_with_pvalues(df)

# Visualize only the significant correlations
def plot_significant_correlations(corr, pvals, threshold=0.05):
    """Plot the correlations whose p-value is below the threshold."""
    mask = pvals > threshold
    sig_corr = corr.copy()
    sig_corr[mask] = np.nan
    plt.figure(figsize=(10, 8))
    sns.heatmap(sig_corr,
                annot=True,
                cmap='coolwarm',
                center=0,
                square=True,
                fmt='.3f')
    plt.title(f'Significant Correlations (p < {threshold})')
    plt.show()

plot_significant_correlations(corr, pvals)
4.2 Confidence Intervals
from scipy import stats

def correlation_ci(x, y, confidence=0.95):
    """Confidence interval for a Pearson correlation coefficient."""
    r, p = stats.pearsonr(x, y)
    n = len(x)
    # Fisher z-transformation: z = arctanh(r) is approximately normal
    # with standard error 1 / sqrt(n - 3)
    z = np.arctanh(r)
    se = 1 / np.sqrt(n - 3)
    z_critical = stats.norm.ppf((1 + confidence) / 2)
    # Interval on the z scale, transformed back to the r scale
    z_lower = z - z_critical * se
    z_upper = z + z_critical * se
    r_lower = np.tanh(z_lower)
    r_upper = np.tanh(z_upper)
    return {
        'correlation': r,
        'p_value': p,
        'ci_lower': r_lower,
        'ci_upper': r_upper,
        'significant': p < 0.05
    }

# Example
result = correlation_ci(df['sales'], df['advertising'])
print(f"Correlation: {result['correlation']:.3f}")
print(f"95% CI: ({result['ci_lower']:.3f}, {result['ci_upper']:.3f})")
print(f"p-value: {result['p_value']:.3f}, significant: {result['significant']}")
5. Grouped Correlation Analysis
5.1 Computing Correlations by Group
# Add a grouping variable
df['region'] = np.random.choice(['North', 'South', 'East'], size=len(df))

# Correlation of one pair of columns within each group
def group_correlation(group, col1, col2):
    """Correlation between two columns within a single group."""
    return group[col1].corr(group[col2])

grouped_corr = df.groupby('region').apply(group_correlation, 'sales', 'advertising')
print("Sales-advertising correlation by region:")
print(grouped_corr)

# Full correlation matrix per group (numeric columns only, since
# corr() would otherwise choke on the string 'region' column)
def group_corr_matrix(group):
    """Correlation matrix for one group."""
    return group.select_dtypes(include=[np.number]).corr()

group_corrs = df.groupby('region').apply(group_corr_matrix)
print("\nCorrelation matrices by region:")
print(group_corrs)
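Because apply returns one DataFrame per group, group_corrs carries a MultiIndex of (region, variable); a single region's matrix can be sliced back out with .xs:
# Slice one region's correlation matrix from the stacked result
north_matrix = group_corrs.xs('North')
print(north_matrix)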
5.2 Comparing Correlations Across Groups
def compare_group_correlations(group1_corr, group2_corr, n1, n2):
    """Compare two independent correlation coefficients (Fisher z-test)."""
    # Fisher z-transformation
    z1 = np.arctanh(group1_corr)
    z2 = np.arctanh(group2_corr)
    # Standard error of the difference
    se_diff = np.sqrt(1/(n1-3) + 1/(n2-3))
    # z statistic and two-sided p-value
    z_stat = (z1 - z2) / se_diff
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
    return {
        'z_statistic': z_stat,
        'p_value': p_value,
        'significant_diff': p_value < 0.05
    }

# Example comparison, using the actual group sizes
north = df[df['region'] == 'North']
south = df[df['region'] == 'South']
north_corr = north['sales'].corr(north['advertising'])
south_corr = south['sales'].corr(south['advertising'])
comparison = compare_group_correlations(north_corr, south_corr, len(north), len(south))
print(f"Test for a regional difference in correlation: p={comparison['p_value']:.3f}")
6. Partial Correlation Analysis
6.1 Controlling for Confounding Variables
from pingouin import partial_corr  # requires: pip install pingouin

# Partial correlation: sales vs. advertising, controlling for temperature
partial_result = partial_corr(data=df,
                              x='advertising',
                              y='sales',
                              covar='temperature')
print("Partial correlation (controlling for temperature):")
print(partial_result)

# Manual partial correlation via the regression-residual method
from sklearn.linear_model import LinearRegression

def partial_correlation(X, Y, Z):
    """Partial correlation of X and Y controlling for Z (residual method)."""
    # Residuals of X regressed on Z
    model_x = LinearRegression().fit(Z.reshape(-1, 1), X)
    X_resid = X - model_x.predict(Z.reshape(-1, 1))
    # Residuals of Y regressed on Z
    model_y = LinearRegression().fit(Z.reshape(-1, 1), Y)
    Y_resid = Y - model_y.predict(Z.reshape(-1, 1))
    # Correlation of the residuals
    return np.corrcoef(X_resid, Y_resid)[0, 1]

# Example
manual_partial = partial_correlation(
    df['advertising'].values,
    df['sales'].values,
    df['temperature'].values
)
print(f"Manual partial correlation: {manual_partial:.3f}")
7. Nonlinear Correlation
7.1 Distance Correlation
# Requires: pip install dcor
import dcor

# Distance correlation (captures nonlinear dependence)
dist_corr = dcor.distance_correlation(df['sales'].to_numpy(), df['temperature'].to_numpy())
print(f"Distance correlation: {dist_corr:.3f}")

# Mutual information (another nonlinear dependence measure)
from sklearn.feature_selection import mutual_info_regression
mi = mutual_info_regression(df[['temperature']], df['sales'])
print(f"Mutual information: {mi[0]:.3f}")
7.2 Correlation Trend Analysis
# Add a nonlinear relationship
df['nonlinear'] = df['temperature']**2 + np.random.normal(0, 1, len(df))

# Rolling correlation (detects changes in a relationship over time)
def rolling_correlation(x, y, window):
    return pd.Series(x).rolling(window).corr(pd.Series(y))

rolling_corr = rolling_correlation(df['sales'], df['advertising'], 20)

plt.figure(figsize=(12, 6))
rolling_corr.plot()
plt.title('Rolling Correlation (window=20)')
plt.ylabel('Correlation coefficient')
plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
plt.show()
8. Advanced Visualization and Reporting
8.1 Correlation Network Graph
import networkx as nx
import matplotlib.pyplot as plt

def correlation_network(df, threshold=0.5):
    """Draw a network whose edges are variable pairs with |r| above the threshold."""
    corr_abs = df.select_dtypes(include=[np.number]).corr().abs()
    edges = []
    for i in range(len(corr_abs.columns)):
        for j in range(i+1, len(corr_abs.columns)):
            if corr_abs.iloc[i, j] > threshold:
                edges.append((corr_abs.columns[i],
                              corr_abs.columns[j],
                              corr_abs.iloc[i, j]))
    G = nx.Graph()
    G.add_weighted_edges_from(edges)
    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G)
    # Edge width proportional to correlation strength
    weights = [G[u][v]['weight'] * 5 for u, v in G.edges()]
    nx.draw(G, pos,
            width=weights,
            edge_color='gray',
            node_color='lightblue',
            with_labels=True,
            node_size=2000,
            font_size=10)
    plt.title(f'Correlation Network (|r| > {threshold})')
    plt.show()

correlation_network(df, threshold=0.3)
8.2 Generating a Correlation Report
def generate_correlation_report(df, method='pearson', alpha=0.05):
    """Generate a detailed correlation report."""
    report = {}
    # Correlations and p-values
    corr_matrix, p_matrix = correlation_matrix_with_pvalues(df, method=method)
    # Significant pairs
    significant_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            var1, var2 = corr_matrix.columns[i], corr_matrix.columns[j]
            r = corr_matrix.loc[var1, var2]
            p = p_matrix.loc[var1, var2]
            if p < alpha:
                significant_pairs.append({
                    'variables': f"{var1} - {var2}",
                    'correlation': r,
                    'p_value': p,
                    'interpretation': interpret_correlation(r),
                    'strength': 'strong' if abs(r) > 0.7 else 'moderate' if abs(r) > 0.3 else 'weak'
                })
    # Sort by absolute correlation
    significant_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)
    report['summary'] = {
        'total_variables': len(corr_matrix.columns),
        'significant_pairs': len(significant_pairs),
        'method': method,
        'alpha': alpha
    }
    report['significant_correlations'] = significant_pairs[:10]  # top 10
    # Strongest pairs regardless of significance
    max_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
    max_corr = max_corr[max_corr < 1].drop_duplicates().head(5)
    report['top_correlations'] = max_corr.to_dict()
    return report

# Generate the report
report = generate_correlation_report(df)
print("Correlation analysis report:")
print(f"Significant pairs: {report['summary']['significant_pairs']}")
for pair in report['significant_correlations']:
    print(f"{pair['variables']}: {pair['correlation']:.3f} (p={pair['p_value']:.3f})")
9. Practical Case Studies
9.1 Correlations in Financial Data
# Simulate daily stock prices as random walks: a starting level plus cumulative shocks
dates = pd.date_range('2023-01-01', periods=252)
stocks = pd.DataFrame({
    'AAPL': 100 + np.random.normal(0, 2, 252).cumsum(),
    'GOOGL': 100 + np.random.normal(0, 2, 252).cumsum(),
    'MSFT': 100 + np.random.normal(0, 2, 252).cumsum(),
    'SP500': 4000 + np.random.normal(0, 50, 252).cumsum()
}, index=dates)

# Add a common market component, scaled to the stocks' price level
market_moves = stocks['SP500'] - stocks['SP500'].iloc[0]
stocks['AAPL'] += market_moves * 0.012
stocks['GOOGL'] += market_moves * 0.014
stocks['MSFT'] += market_moves * 0.010

# Daily returns
returns = stocks.pct_change().dropna()

# Correlation analysis
stock_corr = returns.corr()
print("Stock return correlations:")
print(stock_corr)

# Rolling correlation (detecting market-regime changes)
rolling_corr = returns['AAPL'].rolling(30).corr(returns['SP500'])

plt.figure(figsize=(12, 6))
rolling_corr.plot()
plt.title('Rolling Correlation: AAPL vs S&P 500')
plt.ylabel('Correlation coefficient')
plt.show()
9.2 Sales Data Analysis
# Sales dataset
sales_df = pd.DataFrame({
    'month': pd.date_range('2023-01-01', periods=12, freq='ME'),  # use 'M' on pandas < 2.2
    'sales': [100, 120, 110, 150, 160, 140, 180, 200, 190, 220, 210, 230],
    'advertising_spend': [50, 55, 60, 70, 80, 75, 90, 100, 95, 110, 105, 120],
    'price': [20, 19.5, 20, 19, 18.5, 19, 18, 17.5, 18, 17, 17.5, 17],
    'competitor_price': [21, 20.5, 21, 20, 19.5, 20, 19, 18.5, 19, 18, 18.5, 18],
    'season': ['Winter']*3 + ['Spring']*3 + ['Summer']*3 + ['Fall']*3
})

# Correlation analysis
sales_corr = sales_df.select_dtypes(include=[np.number]).corr()
print("Sales correlations:")
print(sales_corr['sales'].sort_values(ascending=False))

# Per-season analysis (note: only three observations per season,
# so these coefficients are extremely noisy)
seasonal_corr = sales_df.groupby('season').apply(
    lambda g: g.select_dtypes(include=[np.number]).corr().loc['sales']
)
print("\nCorrelations by season:")
print(seasonal_corr)
10. Best Practices for Correlation Analysis
10.1 Preprocessing Recommendations
from sklearn.preprocessing import StandardScaler

def preprocess_for_correlation(df):
    """Preprocess a DataFrame before correlation analysis."""
    df_clean = df.copy()
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    # 1. Impute missing values with the median
    df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())
    # 2. Detect and clip outliers (1.5 * IQR rule)
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_clean[col] = df_clean[col].clip(lower=lower, upper=upper)
    # 3. Standardize (optional: correlation itself is scale-invariant, but
    #    this keeps scatter plots comparable across variables)
    scaler = StandardScaler()
    df_clean[numeric_cols] = scaler.fit_transform(df_clean[numeric_cols])
    # 4. Flag multicollinearity: pairs with |r| > 0.8 in the upper triangle
    corr_matrix = df_clean[numeric_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_pairs = [(col1, col2, upper.loc[col1, col2])
                       for col1 in upper.index
                       for col2 in upper.columns
                       if upper.loc[col1, col2] > 0.8]
    print("Highly correlated pairs (|r| > 0.8):")
    for col1, col2, corr in high_corr_pairs:
        print(f"{col1} - {col2}: {corr:.3f}")
    return df_clean, numeric_cols

# Usage
df_processed, num_cols = preprocess_for_correlation(df)
10.2 Automating Correlation Analysis
class CorrelationAnalyzer:
    """Automated correlation analysis."""

    def __init__(self, df, method='pearson', alpha=0.05):
        self.df = df
        self.method = method
        self.alpha = alpha
        self.results = {}

    def analyze(self):
        """Run the full analysis."""
        numeric_df = self.df.select_dtypes(include=[np.number])
        # Correlation matrix
        self.results['correlation_matrix'] = numeric_df.corr(method=self.method)
        # p-value matrix
        corr, pvals = correlation_matrix_with_pvalues(numeric_df, method=self.method)
        self.results['p_values'] = pvals
        # Significant pairs
        significant = []
        for i in range(len(corr.columns)):
            for j in range(i+1, len(corr.columns)):
                r = corr.iloc[i, j]
                p = pvals.iloc[i, j]
                if p < self.alpha:
                    significant.append({
                        'var1': corr.columns[i],
                        'var2': corr.columns[j],
                        'correlation': r,
                        'p_value': p
                    })
        self.results['significant_pairs'] = sorted(
            significant, key=lambda x: abs(x['correlation']), reverse=True
        )
        return self.results

    def plot(self, figsize=(12, 10)):
        """Visualize the results."""
        corr = self.results['correlation_matrix']
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        # Heatmap
        sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=axes[0, 0])
        axes[0, 0].set_title('Correlation Heatmap')
        # Significance-masked heatmap
        mask = self.results['p_values'] > self.alpha
        sig_corr = corr.copy()
        sig_corr[mask] = np.nan
        sns.heatmap(sig_corr, annot=True, cmap='coolwarm', center=0, ax=axes[0, 1])
        axes[0, 1].set_title(f'Significant Correlations (p < {self.alpha})')
        # Distribution of coefficients
        corr_values = corr.values[np.triu_indices_from(corr.values, k=1)]
        axes[1, 0].hist(corr_values, bins=20, alpha=0.7)
        axes[1, 0].set_title('Distribution of Correlation Coefficients')
        axes[1, 0].set_xlabel('Correlation coefficient')
        # Top significant pairs
        top_pairs = self.results['significant_pairs'][:5]
        if top_pairs:
            top_corr = [abs(p['correlation']) for p in top_pairs]
            variables = [f"{p['var1']}-{p['var2']}" for p in top_pairs]
            axes[1, 1].barh(range(len(top_corr)), top_corr)
            axes[1, 1].set_yticks(range(len(top_corr)))
            axes[1, 1].set_yticklabels(variables)
            axes[1, 1].set_title('Top 5 Significant Pairs')
        plt.tight_layout()
        plt.show()

    def summary(self):
        """Print a summary."""
        results = self.results
        print(f"Method: {self.method}")
        print(f"Significance level: {self.alpha}")
        print(f"Numeric variables: {len(results['correlation_matrix'].columns)}")
        print(f"Significant pairs: {len(results['significant_pairs'])}")
        if results['significant_pairs']:
            print("\nTop 5 significant pairs:")
            for pair in results['significant_pairs'][:5]:
                print(f"  {pair['var1']} - {pair['var2']}: "
                      f"r={pair['correlation']:.3f}, p={pair['p_value']:.3f}")

# Usage
analyzer = CorrelationAnalyzer(df)
results = analyzer.analyze()
analyzer.plot()
analyzer.summary()
11. Caveats and Limitations
11.1 Common Pitfalls
# 1. Correlation is not causation
print("⚠️ Correlation ≠ causation")
print("Example: ice cream sales correlate with drownings, yet neither causes the other")
# 2. Sample size matters: small samples give unstable estimates
small_sample = df.head(10)
large_sample = df
print(f"Small-sample correlation: {small_sample['sales'].corr(small_sample['advertising']):.3f}")
print(f"Full-sample correlation: {large_sample['sales'].corr(large_sample['advertising']):.3f}")

# 3. Detecting nonlinear relationships: correlating against a transformed
#    feature can reveal dependence the raw linear coefficient misses
df['temp_squared'] = df['temperature'] ** 2
linear_corr = df['sales'].corr(df['temperature'])
nonlinear_corr = df['sales'].corr(df['temp_squared'])
print(f"Linear correlation: {linear_corr:.3f}")
print(f"Correlation with the squared term: {nonlinear_corr:.3f}")
11.2 Robustness Checks
def robustness_check(df, var1, var2):
    """Check how stable a correlation is to outliers and subsampling."""
    results = {}
    # Original correlation
    results['original'] = df[var1].corr(df[var2])
    # After removing outliers (1.5 * IQR rule on both variables)
    quantiles = df[[var1, var2]].quantile([0.25, 0.75])
    Q1, Q3 = quantiles.loc[0.25], quantiles.loc[0.75]
    IQR = Q3 - Q1
    mask = ((df[var1] >= Q1[var1] - 1.5*IQR[var1]) &
            (df[var1] <= Q3[var1] + 1.5*IQR[var1]) &
            (df[var2] >= Q1[var2] - 1.5*IQR[var2]) &
            (df[var2] <= Q3[var2] + 1.5*IQR[var2]))
    results['no_outliers'] = df[mask][var1].corr(df[mask][var2])
    # Different subsample fractions
    for frac in [0.5, 0.8, 1.0]:
        sample = df.sample(frac=frac, random_state=42)
        results[f'sample_{frac}'] = sample[var1].corr(sample[var2])
    return pd.DataFrame([results]).T

# Run the check
robust_results = robustness_check(df, 'sales', 'advertising')
print("Robustness check:")
print(robust_results)
Correlation analysis is a core tool of exploratory data analysis, but it must be combined with domain context, statistical significance, and causal reasoning. Pandas provides powerful correlation primitives; paired with visualization and statistical testing, they yield deep insight into how variables relate.