|

Pandas 数组/标量/数据类型参考手册

Pandas 数组/标量/数据类型完整参考手册

1. Pandas 数据类型体系

1.1 核心数据类型分类

import pandas as pd
import numpy as np
# pandas exports its nullable dtypes as Int64Dtype / Float64Dtype /
# BooleanDtype. There are no top-level names `Int64`, `Float64` or `Boolean`,
# so importing those raised ImportError before any example could run.
from pandas import array, Int64Dtype, Float64Dtype, BooleanDtype

# Overview of the dtype families covered in this guide.
print("Pandas数据类型分类:")
print("1. NumPy兼容类型: int8, int16, int32, int64, float32, float64, bool, object")
print("2. Pandas扩展类型: Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64")
print("3. Float类型: Float32, Float64")
print("4. 布尔类型: boolean, Boolean")
print("5. 字符串类型: string")
print("6. 分类类型: category")
print("7. 日期时间类型: datetime64[ns], timedelta64[ns]")
print("8. Pandas Array: ExtensionArray")

1.2 类型层次结构

# Dtype families keyed by family name. Every entry is a string that pandas
# accepts as a dtype specifier, so the table can be fed straight to astype().
# 'Timestamp' was listed here before, but it is the scalar class for a single
# point in time, not a dtype; the time-zone-aware dtype takes its place.
dtype_hierarchy = {
    'Numeric': ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'],
    'Nullable Integer': ['Int8', 'Int16', 'Int32', 'Int64', 'UInt8', 'UInt16', 'UInt32', 'UInt64'],
    'Nullable Float': ['Float32', 'Float64'],
    'Boolean': ['bool', 'boolean'],
    'String': ['object', 'string'],
    'Category': ['category'],
    'Datetime': ['datetime64[ns]', 'datetime64[ns, UTC]'],
    'Timedelta': ['timedelta64[ns]']
}

for category, types in dtype_hierarchy.items():
    print(f"{category}: {types}")

2. 数值类型详解

2.1 NumPy 数值类型

# NumPy fixed-width integer types and the value range each one can hold.
numpy_int_types = {
    'int8': np.int8,      # -128 to 127
    'int16': np.int16,    # -32,768 to 32,767
    'int32': np.int32,    # -2^31 to 2^31-1
    'int64': np.int64,    # -2^63 to 2^63-1
    'uint8': np.uint8,    # 0 to 255
    'uint16': np.uint16,  # 0 to 65,535
    'uint32': np.uint32,  # 0 to 4,294,967,295
    'uint64': np.uint64   # 0 to 2^64-1
}

# int8 wrap-around. Passing dtype='int8' to the constructor is rejected by
# current pandas because 128 and -129 do not fit; an explicit astype() performs
# the unchecked NumPy cast, which is what actually demonstrates the wrap.
s = pd.Series([127, 128, -129]).astype('int8')
print("int8溢出行为:")
print(s)  # 128 wraps to -128, -129 wraps to 127

# Store the same literal at both widths. Widening a float32 value with float()
# only reprints the already-rounded number, so a real float64 Series is needed
# for the second line to show the extra precision.
s_float = pd.Series([1.23456789], dtype='float32')
s_float64 = pd.Series([1.23456789], dtype='float64')
print("float32精度:")
print(f"float32: {s_float[0]:.8f}")
print(f"float64: {s_float64[0]:.8f}")

2.2 Pandas 可空整数类型(推荐)

# Nullable integer dtype: missing entries are stored as pd.NA (not NaN), so the
# column keeps an integer dtype even with gaps.
s_nullable = pd.Series([1, 2, None, 4], dtype="Int64")
print("可空整数类型:")
print(s_nullable)
print("类型:", s_nullable.dtype)
# Dtype objects have no `is_nullable` attribute. The public way to tell a
# nullable dtype apart is its missing-value marker, which is pd.NA.
print("是否可空:", s_nullable.dtype.na_value is pd.NA)
print("NaN表示:", pd.NA)

# Legacy alternative: an integer column with a gap has to be stored as float64,
# with NaN standing in for the missing entry.
s_traditional = pd.Series([1, 2, np.nan, 4], dtype="float64")
print("\n传统float64:")
print(s_traditional)

# Unsigned nullable integer.
s_uint = pd.Series([1, 2, None, 4], dtype="UInt32")
print("\n无符号可空整数:")
print(s_uint)

2.3 浮点类型优化

# Nullable float dtype: the None entry is kept as a missing value.
s_float_null = pd.Series([1.1, 2.2, None, 4.4], dtype=pd.Float64Dtype())
print("可空浮点:", s_float_null, sep="\n")

# Same three literals stored at both widths; the third one exposes the
# difference in the last printed digits.
values = [0.1, 0.2, 0.30000000000000004]
s32, s64 = (pd.Series(values, dtype=width) for width in ("float32", "float64"))
print("\n浮点精度对比:")
for label, series in (("float32", s32), ("float64", s64)):
    print(f"{label}: {series[2]:.17f}")

3. 布尔和逻辑类型

3.1 传统布尔类型

# Plain NumPy-backed bool: it has no slot for a missing marker.
s_bool = pd.Series([True, False, True])
print("传统bool:")
print(s_bool)
print("内存:", s_bool.memory_usage())

# Forcing dtype="bool" onto data that contains None is not guaranteed to raise:
# NumPy's cast applies truthiness, so the gap can silently turn into False.
# Both outcomes are reported, so the example prints something either way.
try:
    s_forced = pd.Series([True, False, None], dtype="bool")
except (TypeError, ValueError) as e:
    print("错误:", e)
else:
    print("强制bool后的结果(None被静默转换):", s_forced.tolist())

# Without an explicit dtype the None is preserved and the column falls back to
# object, which is the visible sign that bool could not hold it.
print("自动推断类型:", pd.Series([True, False, None]).dtype)

3.2 可空布尔类型

# Nullable boolean dtype (recommended): the None entry stays missing.
s_pandas_bool = pd.Series([True, False, None, True], dtype=pd.BooleanDtype())
print("可空布尔:", s_pandas_bool, sep="\n")
print("类型:", s_pandas_bool.dtype)
print("内存:", s_pandas_bool.memory_usage())

# Element-wise logic against two fully populated operands.
and_operand = pd.Series([True, True, True, False], dtype="boolean")
or_operand = pd.Series([False, True, False, True], dtype="boolean")
print("\n逻辑操作:")
print(s_pandas_bool & and_operand)
print(s_pandas_bool | or_operand)

4. 字符串类型

4.1 传统object vs 新string类型

# Legacy representation: no dtype is given, so pandas infers one for the
# mixed str/None list.
s_object = pd.Series(["apple", "banana", None, "cherry"])
print("object类型:")
print(s_object)
print("类型:", s_object.dtype)
print("内存:", s_object.memory_usage(deep=True))

# Dedicated string dtype (pandas 1.0+), requested explicitly.
s_string = pd.Series(["apple", "banana", None, "cherry"], dtype="string")
print("\nstring类型:")
print(s_string)
print("类型:", s_string.dtype)
print("内存:", s_string.memory_usage(deep=True))

# Vectorised string methods through the .str accessor.
print("\n字符串操作:")
print(s_string.str.upper())
print(s_string.str.len())
print(s_string.str.contains("a"))

4.2 字符串内存优化

# Memory comparison for a column with many repeated strings and many gaps.
data = ["apple"] * 1000 + ["banana"] * 1000 + [None] * 1000

s_obj = pd.Series(data)
s_str = pd.Series(data, dtype="string")

# Series.memory_usage() already returns a single byte count. Only
# DataFrame.memory_usage() returns per-column values that need .sum(), and
# calling .sum() on the plain int here raised AttributeError.
print("字符串内存对比:")
print(f"object: {s_obj.memory_usage(deep=True) / 1024:.1f} KB")
print(f"string: {s_str.memory_usage(deep=True) / 1024:.1f} KB")

5. 分类类型(Categorical)

5.1 创建分类数据

# Ordered categorical: the list order of `categories` defines the ranking
# (ordered=True), independent of alphabetical order.
categories = ["poor", "average", "good", "excellent"]
s_cat = pd.Series(["good", "poor", "excellent", "average"], 
                  dtype=pd.CategoricalDtype(categories, ordered=True))

print("有序分类:")
print(s_cat)
print("分类:", s_cat.cat.categories)
print("编码:", s_cat.cat.codes)
print("有序:", s_cat.cat.ordered)

# Unordered categorical: categories are inferred from the data by astype.
s_cat_unordered = pd.Series(["A", "B", "C"]).astype("category")

5.2 分类操作

# Add a category, then remove one. Values that belonged to a removed category
# become missing.
s_cat = s_cat.cat.add_categories(["superb"])
print("添加分类:", s_cat.cat.categories)

s_cat = s_cat.cat.remove_categories("poor")
print("移除分类:", s_cat.cat.categories)

# Relabel the existing categories position by position. set_categories() would
# replace the category set instead, and because none of the current values
# appear among the new labels every entry would silently become NaN.
# rename_categories() keeps the data and the ordered flag.
s_cat = s_cat.cat.rename_categories(["low", "medium", "high", "very_high"])
print("重新分类:", s_cat.cat.categories)

# Sorting follows category order, not alphabetical order.
print("分类排序:", s_cat.sort_values())

5.3 内存优化效果

# High-cardinality vs low-cardinality string data.
high_cardinality = [f"item_{i}" for i in range(10000)]
low_cardinality = ["A", "B", "C"] * 3334

# Low cardinality: category stores one small integer code per row plus the
# three labels once.
s_low_str = pd.Series(low_cardinality, dtype="string")
s_low_cat = pd.Series(low_cardinality, dtype="category")

# Series.memory_usage() returns a plain byte count, so there is nothing to
# .sum() (that call raised AttributeError).
low_str_bytes = s_low_str.memory_usage(deep=True)
low_cat_bytes = s_low_cat.memory_usage(deep=True)

print("低基数内存对比:")
print(f"string: {low_str_bytes / 1024:.1f} KB")
print(f"category: {low_cat_bytes / 1024:.1f} KB")
print(f"节省: {1 - low_cat_bytes / low_str_bytes:.1%}")

# High cardinality: every value is its own category, so the codes come on top
# of a full copy of the labels. Printed for contrast with the case above.
s_high_str = pd.Series(high_cardinality, dtype="string")
s_high_cat = pd.Series(high_cardinality, dtype="category")

print("高基数内存对比:")
print(f"string: {s_high_str.memory_usage(deep=True) / 1024:.1f} KB")
print(f"category: {s_high_cat.memory_usage(deep=True) / 1024:.1f} KB")

6. 时间和日期类型

6.1 datetime64 和 Timestamp

# datetime64[ns] (recommended): a Series built from a daily date_range.
dates = pd.date_range("2023-01-01", periods=5, freq="D")
s_datetime = pd.Series(dates)
print("datetime64[ns]:")
print(s_datetime)
print("类型:", s_datetime.dtype)

# Timestamp: the scalar for a single point in time.
ts = pd.Timestamp("2023-01-01 12:00:00")
print("\nTimestamp:", ts)
# tz_localize attaches a zone to the naive timestamp.
print("时区:", ts.tz_localize("UTC"))

# datetime64 Series with a gap: a None is passed in alongside two Timestamps.
s_nullable_dt = pd.Series([pd.Timestamp("2023-01-01"), None, pd.Timestamp("2023-01-03")], 
                         dtype="datetime64[ns]")
print("\n可空datetime:")
print(s_nullable_dt)

6.2 时区处理

# Attach a time zone to naive timestamps.
s_tz = pd.Series(pd.date_range("2023-01-01", periods=3)).dt.tz_localize("US/Eastern")
print("带时区:")
print(s_tz)

# Convert the same instants to another zone.
s_utc = s_tz.dt.tz_convert("UTC")
print("\nUTC转换:")
print(s_utc)

# Period representation of the values. Series.to_period() converts the *index*
# and fails here because the index is a plain RangeIndex; the values have to go
# through the .dt accessor. Periods carry no time zone, so the zone is dropped
# explicitly first, which keeps the local calendar dates.
s_periods = s_tz.dt.tz_localize(None).dt.to_period("D")
print("周期表示:")
print(s_periods)

6.3 Timedelta

# Subtracting a Timestamp from a datetime Series yields a timedelta Series.
deltas = pd.Series(pd.date_range("2023-01-01", periods=3)) - pd.Timestamp("2022-12-31")
print("Timedelta:")
print(deltas)
print("类型:", deltas.dtype)

# Timedelta arithmetic: shift by a fixed duration and scale by a number.
print("\n时间运算:")
print(deltas + pd.Timedelta(days=1))
print(deltas * 2)

7. Pandas ExtensionArray

7.1 什么是ExtensionArray

# ExtensionArray is the pandas interface for array types beyond NumPy's.
# It allows custom dtypes and a custom missing-value representation.

# pd.array with an extension dtype builds such an array directly.
arr = pd.array([1, 2, None, 4], dtype="Int64")
print("ExtensionArray:")
print(arr)
print("类型:", type(arr))
print("是否为ExtensionArray:", pd.api.types.is_extension_array_dtype(arr.dtype))

# NumPy counterpart: the same values held as generic Python objects.
np_arr = np.array([1, 2, None, 4], dtype="object")
print("\nNumPy数组:")
print(np_arr)

7.2 自定义ExtensionArray

# Minimal custom ExtensionArray example.
from pandas.api.extensions import ExtensionArray, ExtensionDtype
import pandas.api.extensions as extensions


class CustomDtype(ExtensionDtype):
    """Dtype tag for CustomArray; elements are arbitrary Python objects."""

    name = "custom"
    type = object  # scalar type of a single element; required by the interface
    na_value = pd.NA

    @classmethod
    def construct_array_type(cls):
        """Return the array class that stores values of this dtype."""
        return CustomArray


class CustomArray(ExtensionArray):
    """Object-backed ExtensionArray that uses pd.NA as its missing marker.

    Implements the construction, indexing, take/copy/concat and isna hooks
    that pandas calls when the array is wrapped in a Series, displayed,
    sliced or reindexed. Element-wise comparison (``__eq__``) and
    ``_from_factorized`` are intentionally left out to keep the example
    short; add them before relying on equality checks or factorize().
    """

    def __init__(self, values, dtype=None, copy=False):
        # Always store an object ndarray so pd.NA can sit next to any value.
        # `dtype` is accepted for interface compatibility; there is only one
        # CustomDtype, so it carries no extra information.
        data = np.asarray(values, dtype=object)
        self._values = data.copy() if copy else data
        self._dtype = CustomDtype()

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an array from a sequence of scalars (pandas' generic constructor hook)."""
        return cls(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several CustomArray instances into one."""
        return cls(np.concatenate([part._values for part in to_concat]))

    @property
    def dtype(self):
        return self._dtype

    @property
    def nbytes(self):
        return self._values.nbytes

    def __len__(self):
        return len(self._values)

    def __array__(self, dtype=None, copy=None):
        # Explicit NumPy conversion so np.asarray(arr) does not have to fall
        # back to element-by-element iteration. A copy is always returned.
        return np.array(self._values, dtype=dtype)

    def __getitem__(self, item):
        # An integer position yields a scalar. Everything else (slice, mask,
        # list of positions) must yield another CustomArray, otherwise pandas
        # receives a bare ndarray and loses the dtype.
        if isinstance(item, (int, np.integer)):
            return self._values[item]
        item = pd.api.indexers.check_array_indexer(self, item)
        return type(self)(self._values[item])

    def isna(self):
        """Return a boolean ndarray marking the missing entries."""
        return np.asarray(pd.isna(self._values), dtype=bool)

    def take(self, indices, allow_fill=False, fill_value=None):
        """Select by position; with allow_fill=True, -1 marks a missing slot."""
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value
        taken = extensions.take(
            self._values, indices, allow_fill=allow_fill, fill_value=fill_value
        )
        return type(self)(taken)

    def copy(self):
        """Return an independent copy of the array."""
        return type(self)(self._values, copy=True)

    @property
    def _na_value(self):
        return pd.NA


# Wrap the custom array in a Series and display it.
custom_arr = CustomArray([1, 2, pd.NA, 4])
s_custom = pd.Series(custom_arr)
print("自定义ExtensionArray:")
print(s_custom)

8. 数据类型转换和优化

8.1 自动类型推断

# pd.to_numeric: parse numeric strings; errors="coerce" turns entries that
# cannot be parsed into missing values instead of raising.
mixed = pd.Series(["1", "2", "3", None, "4.5"])
s_numeric = pd.to_numeric(mixed, errors="coerce")
print("数值转换:")
print(s_numeric)
print("类型:", s_numeric.dtype)

# pd.to_datetime: parse a list of date strings that contains a None.
date_strs = ["2023-01-01", "2023-01-02", None, "2023-01-04"]
s_dates = pd.to_datetime(date_strs)
print("\n日期转换:")
print(s_dates)
print("类型:", s_dates.dtype)

8.2 强制类型转换

# Source frame with one column per basic kind of data.
_source_columns = {
    "integers": [1, 2, 3, 4],
    "floats": [1.0, 2.0, 3.0, 4.0],
    "strings": ["a", "b", "c", "d"],
    "booleans": [True, False, True, False],
    "dates": pd.date_range("2023-01-01", periods=4),
}
df = pd.DataFrame(_source_columns)

# Per-column targets for astype; "dates" is not listed and keeps its dtype.
_nullable_targets = {
    "integers": "Int64",    # nullable integer
    "floats": "Float32",    # 32-bit nullable float
    "strings": "string",    # dedicated string dtype
    "booleans": "boolean",  # nullable boolean
}
df_converted = df.astype(_nullable_targets)
print("类型转换后:", df_converted.dtypes, sep="\n")

8.3 智能下转换(内存优化)

def _smallest_nullable_int(min_val, max_val):
    """Return the narrowest of "Int8"/"Int16"/"Int32" that holds the range, else None."""
    for name, np_type in (("Int8", np.int8), ("Int16", np.int16), ("Int32", np.int32)):
        bounds = np.iinfo(np_type)
        # NaN bounds (empty column) compare False everywhere and fall through.
        if min_val >= bounds.min and max_val <= bounds.max:
            return name
    return None


def optimize_dtypes(df, *, downcast_floats=True):
    """Return a copy of ``df`` with columns converted to smaller dtypes.

    Steps, in order:
      1. int64 columns are narrowed to the smallest nullable integer dtype
         (Int8/Int16/Int32) that holds their range; wider data is left alone.
      2. float64 columns holding only whole numbers (and NaN) become the
         smallest nullable integer dtype that fits. Other float64 columns are
         cast to Float32 when ``downcast_floats`` is true.
      3. object columns whose values are all True/False (or 1/0) become
         "boolean".
      4. Remaining object/string columns with few distinct values (fewer than
         100 and under half the row count) become "category".

    Args:
        df: Input DataFrame; it is not modified.
        downcast_floats: Set to False to keep fractional float64 columns
            untouched. The Float32 cast is lossy (about 7 significant digits).

    Returns:
        A new DataFrame with the same columns and converted dtypes.
    """
    optimized = df.copy()
    n_rows = len(optimized)

    # 1. Integers: pick the narrowest nullable width that fits.
    for col in optimized.select_dtypes(include=["int64"]):
        target = _smallest_nullable_int(optimized[col].min(), optimized[col].max())
        if target is not None:
            optimized[col] = optimized[col].astype(target)

    # 2. Floats.
    for col in optimized.select_dtypes(include=["float64"]):
        present = optimized[col].dropna()
        # Vectorised whole-number test; inf % 1 is NaN and therefore not whole.
        if ((present % 1) == 0).all():
            target = _smallest_nullable_int(present.min(), present.max())
            # Int64 is larger than float64 (values plus mask), so it is only
            # the fallback, and only when the values actually fit.
            fits_int64 = present.empty or (
                present.min() >= -2.0 ** 63 and present.max() < 2.0 ** 63
            )
            if target is None and fits_int64:
                target = "Int64"
            if target is not None:
                optimized[col] = optimized[col].astype(target)
                continue
        if downcast_floats:
            finite = present[np.isfinite(present)]
            # Skip the cast when a finite value would overflow float32 to inf.
            if finite.empty or finite.abs().max() <= np.finfo(np.float32).max:
                optimized[col] = optimized[col].astype("Float32")

    # 3. Boolean-like object columns. This must run before the category step:
    # a True/False column is low-cardinality, so the category step would claim
    # it first and this conversion would never happen.
    for col in optimized.select_dtypes(include=["object"]):
        present = optimized[col].dropna()
        if present.empty:
            continue  # nothing to judge by; an all-missing column is left alone
        try:
            unique_vals = set(present.unique())
        except TypeError:
            continue  # unhashable cell values (lists, dicts) are not booleans
        # 1 == True and 0 == False, so integer flags are covered by this set.
        if unique_vals <= {True, False}:
            optimized[col] = optimized[col].map({True: True, False: False}).astype("boolean")

    # 4. Low-cardinality strings -> category. Guarded because the ratio below
    # divides by the row count.
    if n_rows:
        for col in optimized.select_dtypes(include=["object", "string"]):
            nunique = optimized[col].nunique()
            if nunique / n_rows < 0.5 and nunique < 100:
                optimized[col] = optimized[col].astype("category")

    return optimized

# Measure the effect of optimize_dtypes on a synthetic frame. The seed is fixed
# so the generated data is the same on every run.
np.random.seed(42)
n = 100000
df_original = pd.DataFrame({
    "int_col": np.random.randint(-100, 100, n),
    "float_col": np.random.randn(n),
    "str_col": np.random.choice(["A", "B", "C"], n),
    "bool_col": np.random.choice([True, False], n)
})

# DataFrame.memory_usage returns one value per column, hence the .sum().
original_memory = df_original.memory_usage(deep=True).sum()
df_opt = optimize_dtypes(df_original)
optimized_memory = df_opt.memory_usage(deep=True).sum()

print("内存优化效果:")
print(f"原始: {original_memory / 1024**2:.1f} MB")
print(f"优化后: {optimized_memory / 1024**2:.1f} MB")
print(f"节省: {100 * (1 - optimized_memory/original_memory):.1f}%")

9. 缺失值表示和处理

9.1 不同类型的NaN表示

# pd.NA is the missing-value marker used by the nullable extension dtypes.
print("Pandas缺失值表示:")
print("pd.NA:", pd.NA)
print("类型:", type(pd.NA))
print("是否为NaN:", pd.isna(pd.NA))

# Legacy NumPy NaN: a float value that never compares equal to itself.
print("\nNumPy NaN:")
print("np.nan:", np.nan)
print("类型:", type(np.nan))
print("float(np.nan) == float(np.nan):", float(np.nan) == float(np.nan))  # False!

# Python None, shown for contrast: it does compare equal to itself.
print("\nPython None:")
print("None == None:", None == None)  # True

9.2 缺失值检测

# Three different missing markers (pd.NA, None, np.nan) in one Int64 Series.
s_mixed = pd.Series([1, pd.NA, None, np.nan, 5], dtype="Int64")
print("混合缺失值:")
print(s_mixed)
print("isna():", s_mixed.isna())
print("isnull():", s_mixed.isnull())  # alias of isna()

# Summary statistics: the sum of the mask counts the gaps, the mean gives
# their share.
print("缺失统计:")
print("总数:", s_mixed.isna().sum())
print("比例:", s_mixed.isna().mean())

9.3 缺失值填充策略

s_with_na = pd.Series([1, pd.NA, 3, pd.NA, 5], dtype="Int64")

# Simple fills. fillna(method=...) is deprecated; ffill()/bfill() are the
# dedicated methods and behave the same.
print("填充0:", s_with_na.fillna(0))
print("前向填充:", s_with_na.ffill())
print("后向填充:", s_with_na.bfill())

# Statistical fill. The mean is a float; an Int64 column only accepts whole
# numbers, so it is rounded first (a fractional mean would otherwise raise).
print("均值填充:", s_with_na.fillna(round(s_with_na.mean())))

# Interpolation needs a numeric dtype. A list mixing ints with pd.NA is stored
# as object and cannot be interpolated, so the gaps are written as NaN in a
# float column instead.
s_interp = pd.Series([1, np.nan, np.nan, 4, np.nan], index=pd.date_range("2023-01-01", periods=5))
print("线性插值:", s_interp.interpolate(method="linear"))

10. 类型检查和验证

10.1 类型检查函数

# Dtype predicates from pandas.api.types.
from pandas.api.types import (
    is_numeric_dtype, is_integer_dtype, is_float_dtype,
    is_bool_dtype, is_string_dtype, is_categorical_dtype,
    is_datetime64_any_dtype, is_extension_array_dtype,
    is_object_dtype
)

s_int = pd.Series([1, 2, 3], dtype="Int64")
s_float = pd.Series([1.1, 2.2], dtype="Float64")
s_str = pd.Series(["a", "b"], dtype="string")
s_cat = pd.Series(["A", "B"]).astype("category")

print("类型检查:")
print(f"Int64是数值: {is_numeric_dtype(s_int)}")
print(f"Int64是整数: {is_integer_dtype(s_int)}")
print(f"Float64是浮点: {is_float_dtype(s_float)}")
print(f"string是字符串: {is_string_dtype(s_str)}")
# is_categorical_dtype is deprecated; the documented replacement is an
# isinstance check against CategoricalDtype. The name stays in the import list
# above only so that code elsewhere that still refers to it keeps working.
print(f"category是分类: {isinstance(s_cat.dtype, pd.CategoricalDtype)}")
print(f"Int64是ExtensionArray: {is_extension_array_dtype(s_int.dtype)}")

10.2 运行时类型验证

def validate_column_dtype(df, col_name, expected_dtype):
    """Return True when column ``col_name`` of ``df`` has the expected dtype.

    Args:
        df: DataFrame to inspect.
        col_name: Column label; a missing label raises KeyError.
        expected_dtype: Dtype name ("Int64", "float64", ...), or any dtype
            object/class that pandas understands (``np.float64``,
            ``pd.Int64Dtype()``).

    Returns:
        True if the dtypes match. When "Int64" is expected, a plain NumPy
        integer column is accepted as well, since it holds the same values
        without gaps. A float64 column is NOT accepted. Unrecognised dtype
        specifications yield False.
    """
    actual_dtype = df[col_name].dtype

    # Exact name match: the original contract, kept as the fast path.
    if isinstance(expected_dtype, str) and str(actual_dtype) == expected_dtype:
        return True

    # Normalise so that dtype objects and classes can be compared as well.
    try:
        expected = pd.api.types.pandas_dtype(expected_dtype)
    except (TypeError, ValueError):
        return False

    if pd.api.types.is_dtype_equal(actual_dtype, expected):
        return True

    # Leniency for Int64: accept NumPy integer columns as a fallback.
    return (
        isinstance(expected, pd.Int64Dtype)
        and not is_extension_array_dtype(actual_dtype)
        and is_integer_dtype(actual_dtype)
    )

# Usage example.
df_test = pd.DataFrame({"ints": pd.Series([1, 2, None], dtype="Int64")})
print("类型验证:", validate_column_dtype(df_test, "ints", "Int64"))

11. 性能对比和最佳实践

11.1 类型性能对比

import time

n = 1000000
data = np.random.randint(-1000, 1000, n)
data_min, data_max = data.min(), data.max()

# Integer dtypes to compare: each NumPy width next to its nullable twin.
types = ["int64", "Int64", "int32", "Int32", "int16", "Int16", "int8", "Int8"]
results = {}

for dtype in types:
    # Skip widths that cannot represent the data. The nullable names differ
    # from the NumPy ones only by capitalisation, so lower-casing gives the
    # matching NumPy type for the range lookup. Checking both bounds covers
    # "int8" as well as "Int8"; casting anyway would wrap or raise.
    bounds = np.iinfo(np.dtype(dtype.lower()))
    if data_min < bounds.min or data_max > bounds.max:
        continue

    # Construction time. perf_counter is the monotonic high-resolution clock
    # meant for measuring intervals; time.time() can jump.
    start = time.perf_counter()
    s = pd.Series(data, dtype=dtype)
    create_time = time.perf_counter() - start

    # Time of a simple reduction workload.
    start = time.perf_counter()
    _ = s.sum() + s.mean()
    op_time = time.perf_counter() - start

    results[dtype] = {
        "create_time": create_time,
        "op_time": op_time,
        "memory_mb": s.memory_usage() / 1024**2
    }

# One row per dtype, one column per measurement.
df_perf = pd.DataFrame(results).T
print("类型性能对比:")
print(df_perf.round(4))

11.2 最佳实践总结

def _recommend_integer_dtype(present, has_missing):
    """Pick an integer dtype name for whole-number data with the gaps removed."""
    if present.empty:
        return "Int64"  # no values to size by; the nullable default is safe
    lo, hi = present.min(), present.max()
    int8, int32 = np.iinfo(np.int8), np.iinfo(np.int32)
    if lo >= int8.min and hi <= int8.max:
        return "Int8"
    if has_missing:
        return "Int64"  # gaps need a nullable dtype
    if lo >= int32.min and hi <= int32.max:
        return "int32"  # saves memory over int64
    return "int64"  # int32 would overflow


def best_practices_dtype_selection(data_preview):
    """Recommend a dtype name for each sampled column.

    Args:
        data_preview: Mapping of column name to a pandas Series sample.

    Returns:
        Dict of column name to recommended dtype string. Columns for which
        no rule applies (for example datetimes) are omitted.

    Rules, checked in this order:
      * bool columns -> "boolean". Checked first because bool also counts as
        numeric and would otherwise be sized as an integer.
      * integer columns -> "Int8" if the range fits, "Int64" if there are
        gaps, otherwise "int32" (or "int64" when int32 would overflow).
      * float columns -> an integer dtype only when every value is a whole
        number; otherwise "Float64" with gaps, "float64" without. Floats are
        never pushed into an integer dtype just because their range is small.
      * object/string columns -> "boolean" if all values are True/False,
        "category" if under 10% distinct values, else "string".
    """
    recommendations = {}

    for col, sample in data_preview.items():
        has_missing = bool(sample.isna().any())
        present = sample.dropna()

        # Boolean columns.
        if pd.api.types.is_bool_dtype(sample):
            recommendations[col] = "boolean"

        # Integer columns.
        elif pd.api.types.is_integer_dtype(sample):
            recommendations[col] = _recommend_integer_dtype(present, has_missing)

        # Float columns.
        elif pd.api.types.is_float_dtype(sample):
            whole = (
                not present.empty
                and bool(((present % 1) == 0).all())
                and present.abs().max() < 2.0 ** 63
            )
            if whole:
                recommendations[col] = _recommend_integer_dtype(present, has_missing)
            else:
                recommendations[col] = "Float64" if has_missing else "float64"

        # String-like columns.
        elif pd.api.types.is_object_dtype(sample) or isinstance(sample.dtype, pd.StringDtype):
            if present.empty:
                recommendations[col] = "string"  # avoids dividing by zero below
                continue
            try:
                distinct = set(present.unique())
            except TypeError:
                continue  # unhashable payloads (lists, dicts): no recommendation
            if distinct <= {True, False}:
                recommendations[col] = "boolean"
            elif len(distinct) / len(sample) < 0.1:  # low cardinality
                recommendations[col] = "category"
            else:
                recommendations[col] = "string"  # dedicated string dtype

    return recommendations

# Usage example.
sample_data = {
    "id": pd.Series(range(1000)),
    "category": pd.Series(["A", "B"] * 500),
    "flag": pd.Series([True, False] * 500),
    "score": pd.Series(np.random.randn(1000))
}

recommendations = best_practices_dtype_selection(sample_data)
print("类型推荐:")
for col, dtype in recommendations.items():
    print(f"{col}: {dtype}")

12. 常见问题和陷阱

12.1 整数NaN陷阱

# Pitfall: NumPy int64 has no representation for NaN, so the constructor is
# expected to fail; the handler then shows the nullable Int64 alternative.
try:
    s = pd.Series([1, 2, np.nan], dtype="int64")
except ValueError as e:
    print("错误:", e)
    print("解决方案:使用Int64")
    s_fixed = pd.Series([1, 2, np.nan], dtype="Int64")
    print(s_fixed)

12.2 浮点精度问题

# Floating-point pitfall: binary floats cannot represent 0.1 or 0.2 exactly.
a = 0.1 + 0.2
b = 0.3
print(f"0.1 + 0.2 == 0.3: {a == b}")  # False!

# Fix 1: exact decimal arithmetic.
import decimal
a_dec = decimal.Decimal('0.1') + decimal.Decimal('0.2')
b_dec = decimal.Decimal('0.3')
print(f"Decimal比较: {a_dec == b_dec}")  # True

# Fix 2: tolerance-based comparison.
s1 = pd.Series([0.1, 0.2])
s2 = pd.Series([0.1000000001, 0.2000000001])
print("Pandas容差比较:")
# np.isclose yields the boolean result worth printing.
is_close = bool(np.isclose(s1, s2, rtol=1e-6).all())
print(is_close)
# assert_series_equal is a check, not a value: it returns None when the Series
# match and raises AssertionError otherwise, so printing it only showed "None".
pd.testing.assert_series_equal(s1, s2, check_exact=False, rtol=1e-6)

12.3 分类类型陷阱

# Categorical pitfall: only values from the declared categories can be stored.
s_cat = pd.Series(["A", "B", "C"]).astype("category")

# Assigning a value outside the categories is rejected with an exception; it
# does not quietly become NaN. Current pandas raises TypeError for this, older
# releases raised ValueError, so both are handled.
try:
    s_cat[0] = "D"  # "D" is not among the categories
except (TypeError, ValueError) as e:
    print("错误:", e)

# Fix: register the new category first, then assign.
s_cat = s_cat.cat.add_categories("D")
s_cat[0] = "D"
print("正确添加新分类:", s_cat)

Pandas提供了丰富的数据类型系统。可空类型(Int64、Float64、boolean、string)用 pd.NA 统一表示缺失值,提升了数据处理的灵活性;分类类型和更窄的数值类型则能显著降低内存占用(注意:可空数值类型带有额外的掩码数组,本身比对应的NumPy类型略多占一点内存)。选择合适的数据类型是性能优化的关键,建议用可空类型处理缺失值,用分类类型处理低基数数据。

类似文章

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注