|

Pandas 相关性分析

Pandas 相关性分析详解

1. 相关性基础概念

1.1 相关性类型

  • 皮尔逊相关系数:衡量线性关系,假设正态分布
  • 斯皮尔曼相关系数:基于排名的非参数方法,适合非线性单调关系
  • 肯德尔相关系数:基于一致性对数,适合序数数据
  • 偏相关:控制其他变量后的相关性

1.2 相关系数解释

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 相关系数强度指南
correlation_guide = {
    '强相关': (0.7, 1.0),
    '中相关': (0.3, 0.7),
    '弱相关': (0.1, 0.3),
    '无相关': (0.0, 0.1)
}

def interpret_correlation(coef):
    """解释相关系数强度"""
    abs_coef = abs(coef)
    for strength, (low, high) in correlation_guide.items():
        if low <= abs_coef < high:
            direction = '正' if coef > 0 else '负'
            return f"{direction}{strength}相关 (r={coef:.3f})"
    return "无相关"

2. 基本相关性计算

2.1 皮尔逊相关系数

# Build a synthetic dataset (fixed seed for reproducibility)
np.random.seed(42)
df = pd.DataFrame({
    'sales': np.random.normal(100, 20, 100),
    'advertising': np.random.normal(50, 10, 100),
    'price': np.random.normal(20, 5, 100),
    'temperature': np.random.normal(25, 5, 100)
})

# Inject known structure: sales is rebuilt as a linear mix of
# advertising (+) and price (-) plus Gaussian noise
df['sales'] = df['advertising'] * 0.5 + df['price'] * (-0.3) + np.random.normal(0, 5, 100)

# Full correlation matrix (Pearson is the pandas default)
corr_matrix = df.corr()  # default: Pearson
print("皮尔逊相关系数矩阵:")
print(corr_matrix)

# Pairwise correlation between two specific columns
sales_adv_corr = df['sales'].corr(df['advertising'])
print(f"\n销售与广告相关性: {sales_adv_corr:.3f}")
print(interpret_correlation(sales_adv_corr))

2.2 不同相关系数方法

# Spearman correlation (rank-based; captures monotonic relationships)
spearman_corr = df.corr(method='spearman')
print("斯皮尔曼相关系数:")
print(spearman_corr)

# Kendall's tau (concordant-pair based; suited to ordinal data)
kendall_corr = df.corr(method='kendall')
print("肯德尔相关系数:")
print(kendall_corr)

# Compare the three methods on the same pair of columns
methods = ['pearson', 'spearman', 'kendall']
for method in methods:
    corr = df['sales'].corr(df['advertising'], method=method)
    print(f"{method.capitalize()}: {corr:.3f}")

2.3 协方差矩阵

# Covariance matrix (scale-dependent, unlike correlation)
cov_matrix = df.cov()
print("协方差矩阵:")
print(cov_matrix)

# Correlation is the standardized (scale-free) covariance
print("\n标准化后(相关系数):")
print(corr_matrix)

3. 相关性可视化

3.1 热力图

# Basic heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, 
            annot=True,          # write the coefficient in each cell
            cmap='coolwarm',     # diverging colormap
            center=0,            # anchor the colormap at zero
            square=True,         # square cells
            fmt='.3f')           # number format
plt.title('相关系数热力图')
plt.tight_layout()
plt.show()

# Masked heatmap — the matrix is symmetric, so showing one triangle is enough
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # mask for the upper triangle
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, 
            mask=mask,
            annot=True,
            cmap='RdBu_r',
            center=0,
            square=True,
            fmt='.3f',
            cbar_kws={'shrink': 0.8})
plt.title('相关系数热力图(上三角)')
plt.show()

3.2 相关系数条形图

# Bar chart: correlation of every other variable with sales
sales_corr = corr_matrix['sales'].drop('sales').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sales_corr.plot(kind='bar')
plt.title('销售与其他变量的相关性')
plt.ylabel('相关系数')
plt.xticks(rotation=45)
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)  # zero reference line
plt.tight_layout()
plt.show()

3.3 散点图矩阵

# Pairwise scatter-plot matrix with KDE panels on the diagonal
sns.pairplot(df, 
             kind='scatter', 
             diag_kind='kde',
             plot_kws={'alpha': 0.6})
plt.suptitle('变量散点图矩阵', y=1.02)
plt.show()

# Restrict to the columns of interest and add fitted regression lines
sns.pairplot(df[['sales', 'advertising', 'price']], 
             kind='reg',  # add a regression line per panel
             diag_kind='hist')
plt.show()

4. 统计显著性检验

4.1 p值计算

from scipy import stats
import pandas as pd

def correlation_with_pvalue(x, y, method='pearson'):
    """Compute a correlation coefficient together with its p-value.

    Args:
        x, y: paired numeric sequences of equal length.
        method: 'pearson', 'spearman' or 'kendall'.

    Returns:
        dict with keys 'correlation', 'p_value' and 'significant'
        (True when p < 0.05).

    Raises:
        ValueError: if *method* is not a supported name. (The original
            fell through to an UnboundLocalError on `r` instead.)
    """
    if method == 'pearson':
        r, p = stats.pearsonr(x, y)
    elif method == 'spearman':
        r, p = stats.spearmanr(x, y)
    elif method == 'kendall':
        r, p = stats.kendalltau(x, y)
    else:
        raise ValueError(f"unsupported correlation method: {method!r}")

    return {'correlation': r, 'p_value': p, 'significant': p < 0.05}

# 对所有变量对计算
def correlation_matrix_with_pvalues(df, method='pearson'):
    """Correlation matrix plus a matching matrix of p-values.

    Args:
        df: DataFrame of numeric columns.
        method: correlation method, forwarded to both `DataFrame.corr`
            and `correlation_with_pvalue`. New parameter with a default
            of 'pearson', so existing callers are unaffected; previously
            the method was silently fixed to Pearson.

    Returns:
        (corr_matrix, p_matrix); the diagonal of p_matrix is NaN.
    """
    corr_matrix = df.corr(method=method)
    p_matrix = pd.DataFrame(index=corr_matrix.columns, columns=corr_matrix.columns)

    for col1 in df.columns:
        for col2 in df.columns:
            if col1 != col2:
                # The matrix is symmetric, so each pair is computed twice;
                # kept for simplicity.
                result = correlation_with_pvalue(df[col1], df[col2], method=method)
                p_matrix.loc[col1, col2] = result['p_value']
            else:
                p_matrix.loc[col1, col2] = np.nan

    return corr_matrix, p_matrix

corr, pvals = correlation_matrix_with_pvalues(df)

# Visualize only the statistically significant cells
def plot_significant_correlations(corr, pvals, threshold=0.05):
    """Heatmap of correlations whose p-value is below *threshold*.

    Cells with p >= threshold are blanked out (set to NaN).
    """
    mask = pvals > threshold
    sig_corr = corr.copy()
    sig_corr[mask] = np.nan

    plt.figure(figsize=(10, 8))
    sns.heatmap(sig_corr, 
                annot=True,
                cmap='coolwarm',
                center=0,
                square=True,
                fmt='.3f')
    plt.title(f'显著相关系数 (p < {threshold})')
    plt.show()

plot_significant_correlations(corr, pvals)

4.2 置信区间

from scipy import stats

def correlation_ci(x, y, confidence=0.95):
    """Pearson correlation with a Fisher-z confidence interval.

    Args:
        x, y: paired numeric sequences of equal length.
        confidence: confidence level for the interval (default 0.95).

    Returns:
        dict with 'correlation', 'p_value', 'ci_lower', 'ci_upper'
        and 'significant' (p < 0.05).
    """
    r, p = stats.pearsonr(x, y)
    n = len(x)

    # Fisher z-transform: z = arctanh(r), with SE = 1/sqrt(n - 3)
    z_center = np.arctanh(r)
    std_err = 1 / np.sqrt(n - 3)
    half_width = stats.norm.ppf((1 + confidence) / 2) * std_err

    # Back-transform the interval endpoints to the r scale
    r_lower = np.tanh(z_center - half_width)
    r_upper = np.tanh(z_center + half_width)

    return {
        'correlation': r,
        'p_value': p,
        'ci_lower': r_lower,
        'ci_upper': r_upper,
        'significant': p < 0.05
    }

# Example: confidence interval for the sales-advertising correlation
result = correlation_ci(df['sales'], df['advertising'])
print(f"相关系数: {result['correlation']:.3f}")
print(f"95% CI: ({result['ci_lower']:.3f}, {result['ci_upper']:.3f})")
print(f"p值: {result['p_value']:.3f}, 显著: {result['significant']}")

5. 分组相关性分析

5.1 按组计算相关性

# Add a categorical grouping column
df['region'] = np.random.choice(['North', 'South', 'East'], size=len(df))

# Correlation of two columns within one group
def group_correlation(group, col1, col2):
    """Correlation between *col1* and *col2* inside a single group."""
    return group[col1].corr(group[col2])

# Per-region sales-advertising correlation
grouped_corr = df.groupby('region').apply(
    lambda g: g['sales'].corr(g['advertising'])
)
print("各地区销售-广告相关性:")
print(grouped_corr)

# Full correlation matrix for each group
# NOTE(review): on pandas >= 2.0, group.corr() raises on the non-numeric
# 'region' column (numeric_only no longer defaults to True) — verify.
def group_corr_matrix(group):
    """Correlation matrix of one group."""
    return group.corr()

group_corrs = df.groupby('region').apply(group_corr_matrix)
print("\n各地区相关矩阵:")
print(group_corrs)

5.2 统计检验比较

from scipy.stats import fisher_exact

def compare_group_correlations(group1_corr, group2_corr, n1=30, n2=30):
    """Fisher z-test for the difference between two correlation coefficients.

    Args:
        group1_corr, group2_corr: the two correlations to compare.
        n1, n2: sample sizes of the two groups. New parameters — the
            original hard-coded both to 30, silently ignoring the real
            group sizes; defaults keep existing calls working.

    Returns:
        dict with 'z_statistic', 'p_value' and 'significant_diff'.
    """
    # Fisher z-transform of each correlation
    z1 = np.arctanh(group1_corr)
    z2 = np.arctanh(group2_corr)

    # Standard error of the difference between the two z values
    se_diff = np.sqrt(1/(n1-3) + 1/(n2-3))

    # Two-sided normal test on the standardized difference
    z_stat = (z1 - z2) / se_diff
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

    return {
        'z_statistic': z_stat,
        'p_value': p_value,
        'significant_diff': p_value < 0.05
    }

# Example comparison between two regions.
# NOTE(review): the z-test assumes 30 samples per group, not the actual
# region counts — verify against the real group sizes.
north_corr = df[df['region'] == 'North']['sales'].corr(df[df['region'] == 'North']['advertising'])
south_corr = df[df['region'] == 'South']['sales'].corr(df[df['region'] == 'South']['advertising'])
comparison = compare_group_correlations(north_corr, south_corr)
print(f"地区相关性差异检验: p={comparison['p_value']:.3f}")

6. 偏相关分析

6.1 控制混杂变量

from pingouin import partial_corr  # 需要安装: pip install pingouin

# Partial correlation of sales vs advertising, controlling for temperature
partial_result = partial_corr(data=df, 
                             x='advertising', 
                             y='sales', 
                             covar='temperature')
print("偏相关分析(控制温度):")
print(partial_result)

# 手动计算偏相关(线性回归残差法)
def partial_correlation(X, Y, Z):
    """Partial correlation of X and Y controlling for Z (residual method).

    Regresses X on Z and Y on Z with ordinary least squares, then
    correlates the two residual series. Uses `np.polyfit` (same OLS fit
    as the original sklearn LinearRegression) so the function needs only
    numpy, which the file already imports.

    Args:
        X, Y, Z: 1-D numeric arrays of equal length.

    Returns:
        The partial correlation coefficient (float).
    """
    # Residuals of X after removing the linear effect of Z
    X_resid = X - np.polyval(np.polyfit(Z, X, 1), Z)

    # Residuals of Y after removing the linear effect of Z
    Y_resid = Y - np.polyval(np.polyfit(Z, Y, 1), Z)

    # Correlation of the residuals is the partial correlation
    return np.corrcoef(X_resid, Y_resid)[0, 1]

# Example: should closely match the pingouin result above
manual_partial = partial_correlation(
    df['advertising'].values,
    df['sales'].values,
    df['temperature'].values
)
print(f"手动计算偏相关: {manual_partial:.3f}")

7. 非线性相关性

7.1 距离相关系数

# 需要安装: pip install dcor
import dcor

# Distance correlation captures non-linear dependence
dist_corr = dcor.distance_correlation(df['sales'], df['temperature'])
print(f"距离相关系数: {dist_corr:.3f}")

# Mutual information: another measure of non-linear dependence
from sklearn.feature_selection import mutual_info_regression
mi = mutual_info_regression(df[['temperature']], df['sales'])
print(f"互信息: {mi[0]:.3f}")

7.2 相关性趋势分析

# Add a deliberately non-linear (quadratic) column
df['nonlinear'] = df['temperature']**2 + np.random.normal(0, 1, len(df))

# Rolling correlation (detects changes in the relationship over time)
def rolling_correlation(x, y, window):
    """Rolling-window correlation between two sequences."""
    series_x = pd.Series(x)
    series_y = pd.Series(y)
    return series_x.rolling(window).corr(series_y)

# Plot the rolling sales-advertising correlation over a 20-row window
rolling_corr = rolling_correlation(df['sales'], df['advertising'], 20)
plt.figure(figsize=(12, 6))
rolling_corr.plot()
plt.title('滚动相关性(窗口=20)')
plt.ylabel('相关系数')
plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)  # zero reference line
plt.show()

8. 高级可视化和报告

8.1 相关性网络图

import networkx as nx
import matplotlib.pyplot as plt

def correlation_network(df, threshold=0.5):
    """Draw a graph whose edges are variable pairs with |r| > *threshold*.

    Edge width is proportional to the absolute correlation.
    NOTE(review): df.corr() assumes numeric-only columns; on pandas >= 2.0
    a non-numeric column (e.g. 'region') raises — verify before use.
    """
    corr_abs = df.corr().abs()
    edges = []
    # Collect upper-triangle pairs above the threshold
    for i in range(len(corr_abs.columns)):
        for j in range(i+1, len(corr_abs.columns)):
            if corr_abs.iloc[i, j] > threshold:
                edges.append((corr_abs.columns[i], 
                            corr_abs.columns[j], 
                            corr_abs.iloc[i, j]))

    G = nx.Graph()
    G.add_weighted_edges_from(edges)

    plt.figure(figsize=(10, 8))
    pos = nx.spring_layout(G)
    edges = G.edges()
    # Scale line width by correlation strength
    weights = [G[u][v]['weight'] * 5 for u,v in edges]

    nx.draw(G, pos, 
            edgelist=edges,
            width=weights,
            edge_color='gray',
            node_color='lightblue',
            with_labels=True,
            node_size=2000,
            font_size=10)
    plt.title(f'相关性网络图 (阈值 > {threshold})')
    plt.show()

correlation_network(df, threshold=0.3)

8.2 相关性报告生成

def generate_correlation_report(df, method='pearson', alpha=0.05):
    """Build a summary report of significant correlations.

    NOTE(review): *method* is only recorded in the summary — the actual
    correlations and p-values come from correlation_matrix_with_pvalues,
    which is called without a method argument; confirm before relying on
    non-Pearson reports.

    Args:
        df: numeric DataFrame.
        method: method label stored in the report summary.
        alpha: significance threshold for p-values.

    Returns:
        dict with 'summary', 'significant_correlations' (top 10) and
        'top_correlations' (5 largest |r| pairs).
    """
    report = {}

    # Correlation matrix and matching p-value matrix
    corr_matrix, p_matrix = correlation_matrix_with_pvalues(df)

    # Collect significant upper-triangle pairs
    significant_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            var1, var2 = corr_matrix.columns[i], corr_matrix.columns[j]
            r = corr_matrix.loc[var1, var2]
            p = p_matrix.loc[var1, var2]

            if p < alpha:
                significant_pairs.append({
                    'variables': f"{var1} - {var2}",
                    'correlation': r,
                    'p_value': p,
                    'interpretation': interpret_correlation(r),
                    'strength': '强' if abs(r) > 0.7 else '中' if abs(r) > 0.3 else '弱'
                })

    # Sort by absolute correlation, strongest first
    significant_pairs.sort(key=lambda x: abs(x['correlation']), reverse=True)

    report['summary'] = {
        'total_variables': len(df.columns),
        'significant_pairs': len(significant_pairs),
        'method': method,
        'alpha': alpha
    }

    report['significant_correlations'] = significant_pairs[:10]  # Top 10

    # Five largest off-diagonal |r| values.
    # NOTE(review): drop_duplicates() keys on the value, so distinct pairs
    # that share the same coefficient are collapsed — verify acceptable.
    max_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
    max_corr = max_corr[max_corr < 1].drop_duplicates().head(5)
    report['top_correlations'] = max_corr.to_dict()

    return report

# Generate and print the report
report = generate_correlation_report(df)
print("相关性分析报告:")
print(f"显著相关对数: {report['summary']['significant_pairs']}")
for pair in report['significant_correlations']:
    print(f"{pair['variables']}: {pair['correlation']:.3f} (p={pair['p_value']:.3f})")

9. 实际应用案例

9.1 金融数据相关性

# Simulated daily price series (cumulative sums = random walks)
dates = pd.date_range('2023-01-01', periods=252)
stocks = pd.DataFrame({
    'AAPL': np.random.normal(100, 2, 252).cumsum(),
    'GOOGL': np.random.normal(100, 2, 252).cumsum(),
    'MSFT': np.random.normal(100, 2, 252).cumsum(),
    'SP500': np.random.normal(4000, 50, 252).cumsum()
}, index=dates)

# Couple each stock to the index to create market correlation
stocks['AAPL'] += stocks['SP500'] * 0.6
stocks['GOOGL'] += stocks['SP500'] * 0.7
stocks['MSFT'] += stocks['SP500'] * 0.5

# Daily returns (first row is NaN and dropped)
returns = stocks.pct_change().dropna()

# Correlate returns rather than raw prices (prices are non-stationary)
stock_corr = returns.corr()
print("股票回报率相关性:")
print(stock_corr)

# 30-day rolling correlation (detects market-regime changes)
rolling_corr = returns['AAPL'].rolling(30).corr(returns['SP500'])
plt.figure(figsize=(12, 6))
rolling_corr.plot()
plt.title('AAPL与S&P500滚动相关性')
plt.ylabel('相关系数')
plt.show()

9.2 销售数据分析

# Monthly sales dataset
# NOTE(review): freq='M' is deprecated in pandas >= 2.2 ('ME' is the
# replacement) — verify against the pandas version in use.
sales_df = pd.DataFrame({
    'month': pd.date_range('2023-01-01', periods=12, freq='M'),
    'sales': [100, 120, 110, 150, 160, 140, 180, 200, 190, 220, 210, 230],
    'advertising_spend': [50, 55, 60, 70, 80, 75, 90, 100, 95, 110, 105, 120],
    'price': [20, 19.5, 20, 19, 18.5, 19, 18, 17.5, 18, 17, 17.5, 17],
    'competitor_price': [21, 20.5, 21, 20, 19.5, 20, 19, 18.5, 19, 18, 18.5, 18],
    'season': ['Winter']*3 + ['Spring']*3 + ['Summer']*3 + ['Fall']*3
})

# Correlate numeric columns only (month/season are excluded)
sales_corr = sales_df.select_dtypes(include=[np.number]).corr()
print("销售相关性:")
print(sales_corr['sales'].sort_values(ascending=False))

# Per-season correlation of each numeric column with sales
seasonal_corr = sales_df.groupby('season').apply(
    lambda g: g.select_dtypes([np.number]).corr().loc['sales']
)
print("\n分季节相关性:")
print(seasonal_corr)

10. 相关性分析最佳实践

10.1 前处理建议

def preprocess_for_correlation(df):
    """Prepare a DataFrame for correlation analysis.

    Steps: median-impute missing numeric values, clip IQR outliers,
    z-score standardize, and report highly collinear pairs.

    Args:
        df: input DataFrame (non-numeric columns are passed through).

    Returns:
        (df_clean, numeric_cols): the cleaned copy and the Index of
        numeric columns that were processed.
    """
    df_clean = df.copy()

    # 1. Fill missing numeric values with the column median
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())

    # 2. Clip outliers to the 1.5*IQR fences
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_clean[col] = df_clean[col].clip(lower=lower, upper=upper)

    # 3. Standardize (optional — correlation itself is scale-invariant;
    #    this mainly helps visualization). Population std (ddof=0) matches
    #    sklearn's StandardScaler, which the original imported; pandas
    #    alone is enough here.
    means = df_clean[numeric_cols].mean()
    stds = df_clean[numeric_cols].std(ddof=0)
    df_clean[numeric_cols] = (df_clean[numeric_cols] - means) / stds

    # 4. Report multicollinearity (|r| > 0.8) from the upper triangle
    corr_matrix = df_clean[numeric_cols].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_pairs = [(col1, col2, corr_matrix.loc[col1, col2]) 
                      for col1 in upper.columns 
                      for col2 in upper.index 
                      if upper.loc[col2, col1] > 0.8]

    print("高相关变量对(>0.8):")
    for col1, col2, corr in high_corr_pairs:
        print(f"{col1} - {col2}: {corr:.3f}")

    return df_clean, numeric_cols

# Run the preprocessing pipeline on the demo DataFrame
df_processed, num_cols = preprocess_for_correlation(df)

10.2 自动化相关性分析

class CorrelationAnalyzer:
    """One-stop correlation analysis: matrix, p-values, plots, summary.

    Call `analyze()` before `plot()`/`summary()` — both read
    `self.results`, which only `analyze()` populates.
    """

    def __init__(self, df, method='pearson', alpha=0.05):
        # df: input DataFrame; method: correlation method name;
        # alpha: significance threshold for p-values
        self.df = df
        self.method = method
        self.alpha = alpha
        self.results = {}

    def analyze(self):
        """Run the full analysis and return the results dict."""
        numeric_df = self.df.select_dtypes(include=[np.number])

        # Correlation matrix with the configured method
        self.results['correlation_matrix'] = numeric_df.corr(method=self.method)

        # p-value matrix.
        # NOTE(review): correlation_matrix_with_pvalues is called without a
        # method argument, so p-values are Pearson-based regardless of
        # self.method — confirm this is intended.
        corr, pvals = correlation_matrix_with_pvalues(numeric_df)
        self.results['p_values'] = pvals

        # Significant pairs from the upper triangle
        significant = []
        for i in range(len(corr.columns)):
            for j in range(i+1, len(corr.columns)):
                r = corr.iloc[i, j]
                p = pvals.iloc[i, j]
                if p < self.alpha:
                    significant.append({
                        'var1': corr.columns[i],
                        'var2': corr.columns[j],
                        'correlation': r,
                        'p_value': p
                    })

        # Strongest correlations first
        self.results['significant_pairs'] = sorted(
            significant, key=lambda x: abs(x['correlation']), reverse=True
        )

        return self.results

    def plot(self, figsize=(12, 10)):
        """Draw a 2x2 panel: full heatmap, significant-only heatmap,
        coefficient histogram, and top-pair bars. Requires analyze() first."""
        corr = self.results['correlation_matrix']

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # Full correlation heatmap
        sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=axes[0,0])
        axes[0,0].set_title('相关系数热力图')

        # Heatmap with non-significant cells masked out
        mask = self.results['p_values'] > self.alpha
        sig_corr = corr.copy()
        sig_corr[mask] = np.nan
        sns.heatmap(sig_corr, annot=True, cmap='coolwarm', center=0, ax=axes[0,1])
        axes[0,1].set_title(f'显著相关 (p < {self.alpha})')

        # Distribution of the upper-triangle coefficients
        corr_values = corr.values[np.triu_indices_from(corr.values, k=1)]
        axes[1,0].hist(corr_values, bins=20, alpha=0.7)
        axes[1,0].set_title('相关系数分布')
        axes[1,0].set_xlabel('相关系数')

        # Horizontal bars for the five strongest significant pairs
        top_pairs = self.results['significant_pairs'][:5]
        if top_pairs:
            top_corr = [abs(p['correlation']) for p in top_pairs]
            variables = [f"{p['var1']}-{p['var2']}" for p in top_pairs]
            axes[1,1].barh(range(len(top_corr)), top_corr)
            axes[1,1].set_yticks(range(len(top_corr)))
            axes[1,1].set_yticklabels(variables)
            axes[1,1].set_title('Top 5 显著相关对')

        plt.tight_layout()
        plt.show()

    def summary(self):
        """Print a text summary of the analysis. Requires analyze() first."""
        results = self.results
        print(f"分析方法: {self.method}")
        print(f"显著水平: {self.alpha}")
        print(f"数值变量数: {len(results['correlation_matrix'].columns)}")
        print(f"显著相关对数: {len(results['significant_pairs'])}")

        if results['significant_pairs']:
            print("\nTop 5 显著相关对:")
            for pair in results['significant_pairs'][:5]:
                print(f"  {pair['var1']} - {pair['var2']}: "
                      f"r={pair['correlation']:.3f}, p={pair['p_value']:.3f}")

# Run the analyzer end-to-end on the demo DataFrame
analyzer = CorrelationAnalyzer(df)
results = analyzer.analyze()
analyzer.plot()
analyzer.summary()

11. 注意事项和局限性

11.1 常见误区

# 1. Correlation is not causation
print("⚠️  相关性 ≠ 因果关系")
print("示例:冰激凌销量与溺水事件相关,但不是因果")

# 2. Sample size affects the stability of the estimate
small_sample = df.head(10)
large_sample = df
print(f"小样本相关性: {small_sample['sales'].corr(small_sample['advertising']):.3f}")
print(f"大样本相关性: {large_sample['sales'].corr(large_sample['advertising']):.3f}")

# 3. Linear correlation can miss non-linear relationships
df['temp_squared'] = df['temperature'] ** 2
linear_corr = df['sales'].corr(df['temperature'])
nonlinear_corr = df['sales'].corr(df['temp_squared'])
print(f"线性相关: {linear_corr:.3f}")
print(f"非线性相关: {nonlinear_corr:.3f}")

11.2 稳健性检查

def robustness_check(df, var1, var2):
    """Check how stable the var1-var2 correlation is.

    Recomputes the correlation (a) on the raw data, (b) with IQR outliers
    removed, and (c) on reproducible random sub-samples.

    Args:
        df: source DataFrame.
        var1, var2: column names to correlate.

    Returns:
        One-column DataFrame of correlation values indexed by scenario.
    """
    results = {}

    # Raw correlation
    results['original'] = df[var1].corr(df[var2])

    # Correlation with IQR outliers removed.
    # BUG FIX: the original unpacked the quantile DataFrame directly
    # (`Q1, Q3 = df[[var1, var2]].quantile([0.25, 0.75])`), which iterates
    # over COLUMN LABELS — Q1/Q3 became strings and `Q3 - Q1` raised a
    # TypeError. Select the quantile rows explicitly instead.
    quantiles = df[[var1, var2]].quantile([0.25, 0.75])
    Q1 = quantiles.loc[0.25]
    Q3 = quantiles.loc[0.75]
    IQR = Q3 - Q1
    mask = ((df[var1] >= Q1[var1] - 1.5*IQR[var1]) & 
            (df[var1] <= Q3[var1] + 1.5*IQR[var1]) &
            (df[var2] >= Q1[var2] - 1.5*IQR[var2]) & 
            (df[var2] <= Q3[var2] + 1.5*IQR[var2]))
    results['no_outliers'] = df[mask][var1].corr(df[mask][var2])

    # Correlations on reproducible random sub-samples
    for frac in [0.5, 0.8, 1.0]:
        sample = df.sample(frac=frac, random_state=42)
        results[f'sample_{frac}'] = sample[var1].corr(sample[var2])

    return pd.DataFrame([results]).T

# Run the robustness check on the demo data
robust_results = robustness_check(df, 'sales', 'advertising')
print("稳健性检查:")
print(robust_results)

相关性分析是探索性数据分析的重要工具,但必须结合业务背景、统计显著性和因果推理。Pandas提供了强大的相关性计算能力,结合可视化和统计检验可以获得深入的变量关系洞察。

类似文章

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注