Pandas 数组/标量/数据类型参考手册
Pandas 数组/标量/数据类型完整参考手册
1. Pandas 数据类型体系
1.1 核心数据类型分类
import pandas as pd
import numpy as np
# pandas exposes the nullable dtype classes with a "Dtype" suffix; the names
# `Int64`, `Float64` and `Boolean` are not importable from the top-level
# package and would raise ImportError.
from pandas import array, Int64Dtype, Float64Dtype, BooleanDtype

# Print an overview of the dtype families covered by this reference.
print("Pandas数据类型分类:")
print("1. NumPy兼容类型: int8, int16, int32, int64, float32, float64, bool, object")
print("2. Pandas扩展类型: Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64")
print("3. Float类型: Float32, Float64")
print("4. 布尔类型: boolean, Boolean")
print("5. 字符串类型: string")
print("6. 分类类型: category")
print("7. 日期时间类型: datetime64[ns], timedelta64[ns]")
print("8. Pandas Array: ExtensionArray")
1.2 类型层次结构
# Map each dtype family label to the dtype names listed for that family.
dtype_hierarchy = dict(
    [
        ('Numeric', ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']),
        ('Nullable Integer', ['Int8', 'Int16', 'Int32', 'Int64',
                              'UInt8', 'UInt16', 'UInt32', 'UInt64']),
        ('Nullable Float', ['Float32', 'Float64']),
        ('Boolean', ['bool', 'boolean']),
        ('String', ['object', 'string']),
        ('Category', ['category']),
        ('Datetime', ['datetime64[ns]', 'Timestamp']),
        ('Timedelta', ['timedelta64[ns]']),
    ]
)

# Print one "family: [names]" line per entry, in insertion order.
for category in dtype_hierarchy:
    types = dtype_hierarchy[category]
    print(f"{category}: {types}")
2. 数值类型详解
2.1 NumPy 数值类型
# Built-in NumPy integer types, keyed by dtype name.
numpy_int_types = {
    'int8': np.int8,      # -128 to 127
    'int16': np.int16,    # -32,768 to 32,767
    'int32': np.int32,    # -2^31 to 2^31-1
    'int64': np.int64,    # -2^63 to 2^63-1
    'uint8': np.uint8,    # 0 to 255
    'uint16': np.uint16,  # 0 to 65,535
    'uint32': np.uint32,  # 0 to 4,294,967,295
    'uint64': np.uint64,  # 0 to 2^64-1
}

# Overflow example. Constructing the Series directly with dtype='int8' rejects
# out-of-range values with an exception, so the wraparound is demonstrated by
# casting an existing int64 Series instead: 128 wraps to -128, -129 to 127.
s = pd.Series([127, 128, -129]).astype('int8')
print("int8溢出行为:")
print(s)

# Floating point precision: store the same literal as float32 and as float64
# so the two printed lines actually differ in the trailing digits.
s_float = pd.Series([1.23456789], dtype='float32')
s_float_wide = pd.Series([1.23456789], dtype='float64')
print("float32精度:")
print(f"float32: {s_float[0]:.8f}")
print(f"float64: {s_float_wide[0]:.8f}")
2.2 Pandas 可空整数类型(推荐)
# Nullable integer Series: missing entries are stored as pd.NA and the column
# keeps an integer dtype.
s_nullable = pd.Series([1, 2, None, 4], dtype="Int64")
print("可空整数类型:")
print(s_nullable)
print("类型:", s_nullable.dtype)
# Extension dtypes have no `is_nullable` attribute; a masked (nullable) dtype
# is recognised here by its missing-value marker being pd.NA.
print("是否可空:", s_nullable.dtype.na_value is pd.NA)
print("NaN表示:", pd.NA)

# For comparison: the NumPy-backed representation of the same data has to be
# float64 so that NaN can stand in for the missing value.
s_traditional = pd.Series([1, 2, np.nan, 4], dtype="float64")
print("\n传统float64:")
print(s_traditional)

# Unsigned nullable integers work the same way.
s_uint = pd.Series([1, 2, None, 4], dtype="UInt32")
print("\n无符号可空整数:")
print(s_uint)
2.3 浮点类型优化
# Nullable float Series built from a Float64 extension array.
s_float_null = pd.Series(pd.array([1.1, 2.2, None, 4.4], dtype="Float64"))
print("可空浮点:", s_float_null, sep="\n")

# Store the same values at both precisions and print the third element with
# 17 decimals to compare them.
values = [0.1, 0.2, 0.30000000000000004]
s32, s64 = (pd.Series(values, dtype=width) for width in ("float32", "float64"))
print("\n浮点精度对比:")
third_as_32 = s32[2]
third_as_64 = s64[2]
print(f"float32: {third_as_32:.17f}")
print(f"float64: {third_as_64:.17f}")
3. 布尔和逻辑类型
3.1 传统布尔类型
# NumPy-backed bool Series (no representation for missing values).
s_bool = pd.Series([True, False, True])
print("传统bool:", s_bool, sep="\n")
print("内存:", s_bool.memory_usage())

# Attempt to build a bool Series from data containing None and report a
# ValueError if one is raised.
# NOTE(review): depending on the pandas version this constructor may coerce
# None to False instead of raising, in which case nothing is printed — confirm.
bool_values_with_gap = [True, False, None]
try:
    pd.Series(bool_values_with_gap, dtype="bool")
except ValueError as err:
    print("错误:", err)
3.2 可空布尔类型
# Nullable boolean Series: None is stored as pd.NA.
s_pandas_bool = pd.Series(pd.array([True, False, None, True], dtype="boolean"))
print("可空布尔:", s_pandas_bool, sep="\n")
print("类型:", s_pandas_bool.dtype)
print("内存:", s_pandas_bool.memory_usage())

# Element-wise AND / OR against fully-populated boolean operands.
print("\n逻辑操作:")
and_operand = pd.Series([True, True, True, False], dtype="boolean")
print(s_pandas_bool & and_operand)
or_operand = pd.Series([False, True, False, True], dtype="boolean")
print(s_pandas_bool | or_operand)
4. 字符串类型
4.1 传统object vs 新string类型
# Default inference: the strings are stored with whatever dtype pandas infers.
s_object = pd.Series(["apple", "banana", None, "cherry"])
print("object类型:", s_object, sep="\n")
print("类型:", s_object.dtype)
print("内存:", s_object.memory_usage(deep=True))

# Same data with the dedicated "string" extension dtype.
s_string = pd.Series(["apple", "banana", None, "cherry"], dtype="string")
print("\nstring类型:")
print(s_string)
print("类型:", s_string.dtype)
print("内存:", s_string.memory_usage(deep=True))

# Vectorised string methods via the .str accessor.
print("\n字符串操作:")
str_accessor = s_string.str
print(str_accessor.upper())
print(str_accessor.len())
print(str_accessor.contains("a"))
4.2 字符串内存优化
# Memory comparison for a column of heavily repeated strings.
data = ["apple"] * 1000 + ["banana"] * 1000 + [None] * 1000
s_obj = pd.Series(data)
s_str = pd.Series(data, dtype="string")

# Series.memory_usage() already returns a single byte count (unlike the
# DataFrame version, which returns one value per column), so it must not be
# followed by .sum().
obj_bytes = s_obj.memory_usage(deep=True)
str_bytes = s_str.memory_usage(deep=True)
print("字符串内存对比:")
print(f"object: {obj_bytes / 1024:.1f} KB")
print(f"string: {str_bytes / 1024:.1f} KB")
5. 分类类型(Categorical)
5.1 创建分类数据
# 基本分类
categories = ["poor", "average", "good", "excellent"]
s_cat = pd.Series(["good", "poor", "excellent", "average"],
dtype=pd.CategoricalDtype(categories, ordered=True))
print("有序分类:")
print(s_cat)
print("分类:", s_cat.cat.categories)
print("编码:", s_cat.cat.codes)
print("有序:", s_cat.cat.ordered)
# 无序分类
s_cat_unordered = pd.Series(["A", "B", "C"]).astype("category")
5.2 分类操作
# 添加/删除分类
s_cat = s_cat.cat.add_categories(["superb"])
print("添加分类:", s_cat.cat.categories)
s_cat = s_cat.cat.remove_categories("poor")
print("移除分类:", s_cat.cat.categories)
# 重新分类
s_cat = s_cat.cat.set_categories(["low", "medium", "high", "very_high"],
ordered=True)
print("重新分类:", s_cat.cat.categories)
# 分类排序
print("分类排序:", s_cat.sort_values())
5.3 内存优化效果
# Sample data: many distinct labels vs. three labels repeated many times.
high_cardinality = [f"item_{i}" for i in range(10000)]
low_cardinality = ["A", "B", "C"] * 3334

# Low cardinality: compare the "string" dtype with "category".
s_low_str = pd.Series(low_cardinality, dtype="string")
s_low_cat = pd.Series(low_cardinality, dtype="category")

# Series.memory_usage() returns a single byte count, so no .sum() is needed
# (that pattern only applies to DataFrame.memory_usage()).
low_str_bytes = s_low_str.memory_usage(deep=True)
low_cat_bytes = s_low_cat.memory_usage(deep=True)
print("低基数内存对比:")
print(f"string: {low_str_bytes / 1024:.1f} KB")
print(f"category: {low_cat_bytes / 1024:.1f} KB")
print(f"节省: {1 - low_cat_bytes / low_str_bytes:.1%}")
6. 时间和日期类型
6.1 datetime64 和 Timestamp
# Five consecutive daily timestamps held in a datetime Series.
dates = pd.date_range(start="2023-01-01", periods=5, freq="D")
s_datetime = pd.Series(dates)
print("datetime64[ns]:", s_datetime, sep="\n")
print("类型:", s_datetime.dtype)

# A single scalar timestamp, then the same instant labelled as UTC.
ts = pd.Timestamp("2023-01-01 12:00:00")
print("\nTimestamp:", ts)
ts_in_utc = ts.tz_localize("UTC")
print("时区:", ts_in_utc)

# Datetime Series with a gap: the None entry is stored as NaT.
stamps_with_gap = [pd.Timestamp("2023-01-01"), None, pd.Timestamp("2023-01-03")]
s_nullable_dt = pd.Series(stamps_with_gap, dtype="datetime64[ns]")
print("\n可空datetime:")
print(s_nullable_dt)
6.2 时区处理
# Attach a time zone to naive timestamps.
s_tz = pd.Series(pd.date_range("2023-01-01", periods=3)).dt.tz_localize("US/Eastern")
print("带时区:")
print(s_tz)

# Convert the same instants to UTC.
s_utc = s_tz.dt.tz_convert("UTC")
print("\nUTC转换:")
print(s_utc)

# Convert the values to daily periods. Series.to_period() operates on the
# index (which is a plain RangeIndex here), so the values must be converted
# through the .dt accessor. Periods carry no time zone, so the zone is dropped
# explicitly first, keeping the local wall-clock dates.
s_periods = s_tz.dt.tz_localize(None).dt.to_period("D")
print("周期表示:")
print(s_periods)
6.3 Timedelta
# Subtracting a reference timestamp from a datetime Series yields timedeltas.
reference_point = pd.Timestamp("2022-12-31")
observed_days = pd.Series(pd.date_range("2023-01-01", periods=3))
deltas = observed_days - reference_point
print("Timedelta:", deltas, sep="\n")
print("类型:", deltas.dtype)

# Timedelta arithmetic: shift by one day, then scale by two.
print("\n时间运算:")
one_day = pd.Timedelta(days=1)
print(deltas + one_day)
print(deltas * 2)
7. Pandas ExtensionArray
7.1 什么是ExtensionArray
# pd.array builds a pandas extension array; with dtype="Int64" the None entry
# is kept as a missing value inside an integer array.
raw_values = [1, 2, None, 4]
arr = pd.array(raw_values, dtype="Int64")
print("ExtensionArray:", arr, sep="\n")
print("类型:", type(arr))
is_extension = pd.api.types.is_extension_array_dtype(arr.dtype)
print("是否为ExtensionArray:", is_extension)

# The NumPy counterpart has to fall back to an object array to hold None.
np_arr = np.array(raw_values, dtype="object")
print("\nNumPy数组:")
print(np_arr)
7.2 自定义ExtensionArray
# Minimal custom ExtensionArray example.
from pandas.api.extensions import ExtensionArray, ExtensionDtype
import pandas.api.extensions as extensions


class CustomDtype(ExtensionDtype):
    """Dtype describing :class:`CustomArray`.

    ``name`` is the string alias shown in reprs, ``type`` is the scalar type
    of the elements (arbitrary Python objects here) and ``na_value`` is the
    marker used for missing entries.
    """

    name = "custom"
    type = object
    na_value = pd.NA

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        return CustomArray


class CustomArray(ExtensionArray):
    """One-dimensional extension array backed by an object ndarray.

    Besides the basic container protocol it implements the constructors and
    helpers the ExtensionArray interface requires (``_from_sequence``,
    ``_from_factorized``, ``_concat_same_type``, ``take``, ``copy``) so that
    pandas can slice, reindex, copy and concatenate it.
    """

    def __init__(self, values, dtype=None):
        # `dtype` is accepted for interface compatibility; the array always
        # uses CustomDtype. Forcing an object array keeps pd.NA intact.
        self._values = np.asarray(values, dtype=object)
        self._dtype = CustomDtype()

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """Build an array from a sequence of scalars."""
        return cls(list(scalars))

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild an array from the unique values produced by factorize."""
        return cls(values)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several CustomArray instances into one."""
        return cls(np.concatenate([part._values for part in to_concat]))

    @property
    def dtype(self):
        """The CustomDtype instance of this array."""
        return self._dtype

    @property
    def nbytes(self):
        """Bytes used by the backing ndarray (object pointers only)."""
        return self._values.nbytes

    def __len__(self):
        return len(self._values)

    def __getitem__(self, item):
        """Return a scalar for an integer index, a CustomArray otherwise."""
        if isinstance(item, (int, np.integer)):
            return self._values[item]
        # Slices, boolean masks and integer lists must yield the same array
        # type, not a bare ndarray.
        item = pd.api.indexers.check_array_indexer(self, item)
        return type(self)(self._values[item])

    def __array__(self, dtype=None, copy=None):
        """Expose the data to NumPy as an ndarray."""
        converted = np.asarray(self._values, dtype=dtype)
        return converted.copy() if copy else converted

    def isna(self):
        """Boolean ndarray marking the missing entries."""
        return np.asarray(pd.isna(self._values), dtype=bool)

    def take(self, indices, allow_fill=False, fill_value=None):
        """Select elements by position; -1 means "missing" when allow_fill."""
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value
        picked = extensions.take(
            self._values, indices, allow_fill=allow_fill, fill_value=fill_value
        )
        return type(self)(picked)

    def copy(self):
        """Return a copy that does not share the backing ndarray."""
        return type(self)(self._values.copy())

    @property
    def _na_value(self):
        return pd.NA


# Use the custom array as the data of a Series.
custom_arr = CustomArray([1, 2, pd.NA, 4])
s_custom = pd.Series(custom_arr)
print("自定义ExtensionArray:")
print(s_custom)
8. 数据类型转换和优化
8.1 自动类型推断
# pd.to_numeric with errors="coerce": unparseable or missing entries become NaN.
mixed = pd.Series(["1", "2", "3", None, "4.5"])
s_numeric = pd.to_numeric(mixed, errors="coerce")
print("数值转换:", s_numeric, sep="\n")
print("类型:", s_numeric.dtype)

# pd.to_datetime on a list of ISO date strings containing a None entry.
date_strs = ["2023-01-01", "2023-01-02", None, "2023-01-04"]
s_dates = pd.to_datetime(date_strs)
print("\n日期转换:")
print(s_dates)
print("类型:", s_dates.dtype)
8.2 强制类型转换
# Source frame with one column per basic kind of data.
df = pd.DataFrame({
    "integers": [1, 2, 3, 4],
    "floats": [1.0, 2.0, 3.0, 4.0],
    "strings": ["a", "b", "c", "d"],
    "booleans": [True, False, True, False],
    "dates": pd.date_range("2023-01-01", periods=4),
})

# Per-column target dtypes; columns not listed ("dates") are left unchanged.
target_dtypes = {
    "integers": "Int64",    # nullable integer
    "floats": "Float32",    # nullable 32-bit float
    "strings": "string",    # string extension dtype
    "booleans": "boolean",  # nullable boolean
}
df_converted = df.astype(target_dtypes)
print("类型转换后:")
print(df_converted.dtypes)
8.3 智能下转换(内存优化)
def optimize_dtypes(df):
    """Return a copy of ``df`` with columns converted to more compact dtypes.

    Steps, applied in order:

    1. ``int64`` columns are narrowed to the smallest of the nullable
       ``Int8`` / ``Int16`` / ``Int32`` dtypes that holds their value range.
    2. ``float64`` columns become ``Int64`` when every non-missing value is a
       whole number, otherwise ``Float32`` (which reduces precision).
    3. ``object`` columns whose non-missing values are all ``True``/``False``
       (or the equal values ``1``/``0``) become ``boolean``.
    4. Remaining ``object``/``string`` columns with fewer than 100 distinct
       values covering less than half of the rows become ``category``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the same columns and converted dtypes.
    """
    optimized = df.copy()
    n_rows = len(optimized)

    # 1. Integer narrowing: try the candidate widths from smallest to largest.
    int_candidates = ((np.int8, "Int8"), (np.int16, "Int16"), (np.int32, "Int32"))
    for col in optimized.select_dtypes(include=["int64"]).columns:
        min_val = optimized[col].min()
        max_val = optimized[col].max()
        for np_type, nullable_name in int_candidates:
            bounds = np.iinfo(np_type)
            # For an empty column min/max are NaN, every comparison is False
            # and the column is left untouched.
            if min_val >= bounds.min and max_val <= bounds.max:
                optimized[col] = optimized[col].astype(nullable_name)
                break

    # 2. Float handling.
    for col in optimized.select_dtypes(include=["float64"]).columns:
        non_null = optimized[col].dropna()
        if (non_null % 1 == 0).all():
            # Only whole numbers (plus NaN): store as nullable integer.
            optimized[col] = optimized[col].astype("Int64")
        else:
            optimized[col] = optimized[col].astype("Float32")

    # 3. Boolean detection. This has to run before the category step:
    # boolean-like object columns have at most two distinct values and would
    # otherwise be turned into categories and never be seen here.
    for col in optimized.select_dtypes(include=["object"]).columns:
        try:
            unique_vals = set(optimized[col].dropna().unique())
        except TypeError:
            # Unhashable values (lists, dicts, ...) cannot be boolean flags.
            continue
        # An all-missing column gives no evidence of being boolean.
        if unique_vals and unique_vals <= {True, False}:
            optimized[col] = (
                optimized[col].map(bool, na_action="ignore").astype("boolean")
            )

    # 4. Low-cardinality text -> category.
    for col in optimized.select_dtypes(include=["object", "string"]).columns:
        try:
            nunique = optimized[col].nunique()
        except TypeError:
            continue
        # Guard the ratio against a frame without rows.
        if n_rows and nunique / n_rows < 0.5 and nunique < 100:
            optimized[col] = optimized[col].astype("category")

    return optimized
# Measure the effect of optimize_dtypes on a synthetic frame. The seed is
# fixed and the columns are generated in a fixed order so the data is
# reproducible.
np.random.seed(42)
n = 100000
int_values = np.random.randint(-100, 100, n)
float_values = np.random.randn(n)
str_values = np.random.choice(["A", "B", "C"], n)
bool_values = np.random.choice([True, False], n)
df_original = pd.DataFrame({
    "int_col": int_values,
    "float_col": float_values,
    "str_col": str_values,
    "bool_col": bool_values,
})

# Deep memory usage (bytes, summed over columns and index) before and after.
original_memory = df_original.memory_usage(deep=True).sum()
df_opt = optimize_dtypes(df_original)
optimized_memory = df_opt.memory_usage(deep=True).sum()

bytes_per_mb = 1024**2
saved_percent = 100 * (1 - optimized_memory/original_memory)
print("内存优化效果:")
print(f"原始: {original_memory / bytes_per_mb:.1f} MB")
print(f"优化后: {optimized_memory / bytes_per_mb:.1f} MB")
print(f"节省: {saved_percent:.1f}%")
9. 缺失值表示和处理
9.1 不同类型的NaN表示
# pd.NA is the missing-value marker used by the nullable extension dtypes.
na_marker = pd.NA
print("Pandas缺失值表示:")
print("pd.NA:", na_marker)
print("类型:", type(na_marker))
print("是否为NaN:", pd.isna(na_marker))

# NumPy's NaN is an ordinary float that does not compare equal to itself.
nan_marker = np.nan
print("\nNumPy NaN:")
print("np.nan:", nan_marker)
print("类型:", type(nan_marker))
left_nan, right_nan = float(np.nan), float(np.nan)
print("float(np.nan) == float(np.nan):", left_nan == right_nan)  # False

# Python's None compares equal to itself.
print("\nPython None:")
print("None == None:", None == None)  # True
9.2 缺失值检测
# pd.NA, None and np.nan are all treated as missing in a nullable Int64 Series.
s_mixed = pd.Series([1, pd.NA, None, np.nan, 5], dtype="Int64")
print("混合缺失值:", s_mixed, sep="\n")
missing_mask = s_mixed.isna()
print("isna():", missing_mask)
print("isnull():", s_mixed.isnull())  # alias of isna()

# Summary statistics of the mask: count and share of missing entries.
print("缺失统计:")
print("总数:", missing_mask.sum())
print("比例:", missing_mask.mean())
9.3 缺失值填充策略
s_with_na = pd.Series([1, pd.NA, 3, pd.NA, 5], dtype="Int64")

# Simple fills. The dedicated ffill()/bfill() methods replace the deprecated
# fillna(method=...) form.
print("填充0:", s_with_na.fillna(0))
print("前向填充:", s_with_na.ffill())
print("后向填充:", s_with_na.bfill())

# Statistical fill: the mean here is a whole number (3.0), which an Int64
# column can store.
print("均值填充:", s_with_na.fillna(s_with_na.mean()))

# Interpolation needs a numeric dtype. A list mixing ints with pd.NA is
# inferred as object dtype, which interpolate() cannot handle, so the gaps are
# written as NaN to get a float64 Series.
s_interp = pd.Series([1, np.nan, np.nan, 4, np.nan],
                     index=pd.date_range("2023-01-01", periods=5))
print("线性插值:", s_interp.interpolate(method="linear"))
10. 类型检查和验证
10.1 类型检查函数
# pandas.api.types模块
from pandas.api.types import (
is_numeric_dtype, is_integer_dtype, is_float_dtype,
is_bool_dtype, is_string_dtype, is_categorical_dtype,
is_datetime64_any_dtype, is_extension_array_dtype,
is_object_dtype
)
# One sample Series per dtype family to run the checks against.
s_int = pd.Series([1, 2, 3], dtype="Int64")
s_float = pd.Series([1.1, 2.2], dtype="Float64")
s_str = pd.Series(["a", "b"], dtype="string")
s_cat = pd.Series(["A", "B"]).astype("category")

print("类型检查:")
print(f"Int64是数值: {is_numeric_dtype(s_int)}")
print(f"Int64是整数: {is_integer_dtype(s_int)}")
print(f"Float64是浮点: {is_float_dtype(s_float)}")
print(f"string是字符串: {is_string_dtype(s_str)}")
# is_categorical_dtype is deprecated; checking the dtype instance against
# pd.CategoricalDtype is the supported replacement.
print(f"category是分类: {isinstance(s_cat.dtype, pd.CategoricalDtype)}")
print(f"Int64是ExtensionArray: {is_extension_array_dtype(s_int.dtype)}")
10.2 运行时类型验证
def validate_column_dtype(df, col_name, expected_dtype):
    """Check whether column ``col_name`` of ``df`` has the expected dtype.

    The dtype's string form is compared with ``expected_dtype``. One
    relaxation applies: when ``"Int64"`` is expected, any integer column that
    is not backed by an extension dtype (e.g. ``int32`` or ``int64``) is
    accepted as well.

    Args:
        df: DataFrame holding the column.
        col_name: Name of the column to check.
        expected_dtype: Dtype name to compare against, e.g. ``"Int64"``.

    Returns:
        True when the column's dtype is acceptable, otherwise False.
    """
    actual_dtype = df[col_name].dtype
    wants_nullable_int = expected_dtype == "Int64"
    numpy_backed = not is_extension_array_dtype(actual_dtype)
    if numpy_backed and wants_nullable_int and is_integer_dtype(actual_dtype):
        return True
    return str(actual_dtype) == expected_dtype


# Usage
df_test = pd.DataFrame({"ints": pd.Series([1, 2, None], dtype="Int64")})
print("类型验证:", validate_column_dtype(df_test, "ints", "Int64"))
11. 性能对比和最佳实践
11.1 类型性能对比
import time
def _fits_dtype(values, dtype):
    """Return True when every element of ``values`` fits the integer ``dtype``.

    ``dtype`` may be a NumPy name ("int16") or a nullable pandas name
    ("Int16"); both share the bounds of the lower-cased NumPy type.
    """
    if len(values) == 0:
        return True
    bounds = np.iinfo(dtype.lower())
    return values.min() >= bounds.min and values.max() <= bounds.max


def benchmark_integer_dtypes(values, dtypes):
    """Time Series creation and a simple reduction for each integer dtype.

    Dtypes that cannot represent ``values`` are skipped, because narrowing
    would either wrap the data silently or make the constructor raise.

    Args:
        values: Integer ndarray to load into each Series.
        dtypes: Iterable of dtype names to benchmark.

    Returns:
        Dict mapping each benchmarked dtype name to a dict with the keys
        ``create_time`` and ``op_time`` (seconds) and ``memory_mb``.
    """
    measurements = {}
    for dtype in dtypes:
        if not _fits_dtype(values, dtype):
            continue

        # Creation time. perf_counter is a monotonic high-resolution clock,
        # which suits short intervals better than time.time().
        start = time.perf_counter()
        series = pd.Series(values, dtype=dtype)
        create_time = time.perf_counter() - start

        # Operation time (simple reductions).
        start = time.perf_counter()
        _ = series.sum() + series.mean()
        op_time = time.perf_counter() - start

        measurements[dtype] = {
            "create_time": create_time,
            "op_time": op_time,
            "memory_mb": series.memory_usage() / 1024**2,
        }
    return measurements


n = 1000000
data = np.random.randint(-1000, 1000, n)

# Integer dtypes to compare; the 8-bit ones are skipped for this data range.
types = ["int64", "Int64", "int32", "Int32", "int16", "Int16", "int8", "Int8"]
results = benchmark_integer_dtypes(data, types)

# Show the results, one row per dtype.
df_perf = pd.DataFrame(results).T
print("类型性能对比:")
print(df_perf.round(4))
11.2 最佳实践总结
def _recommend_integer_dtype(min_val, max_val, has_missing):
    """Pick an integer dtype name for the given value range."""
    int8_bounds = np.iinfo(np.int8)
    if min_val >= int8_bounds.min and max_val <= int8_bounds.max:
        return "Int8"
    if has_missing:
        return "Int64"  # nullable, supports missing values
    int32_bounds = np.iinfo(np.int32)
    if min_val >= int32_bounds.min and max_val <= int32_bounds.max:
        return "int32"  # saves memory compared with int64
    return "int64"  # values do not fit into 32 bits


def _is_boolean_like(non_null):
    """Return True when all values equal True/False (this includes 1/0)."""
    try:
        return set(non_null.unique()) <= {True, False}
    except TypeError:
        # Unhashable values cannot be boolean flags.
        return False


def best_practices_dtype_selection(data_preview):
    """Recommend a dtype name for every sampled column.

    Rules, checked in this order for each sample:

    * bool / nullable boolean dtype -> ``"boolean"``
    * integer dtype -> ``"Int8"`` when the range fits 8 bits, ``"Int64"``
      when values are missing, otherwise ``"int32"`` (or ``"int64"`` when
      the range exceeds 32 bits)
    * float dtype -> the integer rules when all values are whole numbers,
      otherwise ``"float64"``
    * other columns holding only True/False -> ``"boolean"``
    * object / string dtype -> ``"category"`` when fewer than 10% of the rows
      are distinct, otherwise ``"string"``

    Bool and float columns are tested before the generic numeric handling
    because pandas classifies both as numeric; they would otherwise be
    recommended an integer dtype.

    Args:
        data_preview: Mapping of column name to a sample ``pd.Series``.

    Returns:
        Dict mapping column name to the recommended dtype name. Samples that
        are empty or contain only missing values get no entry, as do samples
        matching none of the rules.
    """
    recommendations = {}
    for col, sample in data_preview.items():
        non_null = sample.dropna()
        if non_null.empty:
            # Nothing to base a recommendation on (also avoids dividing by
            # a zero length below).
            continue
        has_missing = len(non_null) < len(sample)

        # Boolean columns
        if pd.api.types.is_bool_dtype(sample):
            recommendations[col] = "boolean"
        # Integer columns
        elif pd.api.types.is_integer_dtype(sample):
            recommendations[col] = _recommend_integer_dtype(
                non_null.min(), non_null.max(), has_missing
            )
        # Float columns
        elif pd.api.types.is_float_dtype(sample):
            if (non_null % 1 == 0).all():
                # Whole numbers stored as floats, typically because of NaN.
                recommendations[col] = _recommend_integer_dtype(
                    non_null.min(), non_null.max(), has_missing
                )
            else:
                recommendations[col] = "float64"
        # Boolean flags stored in a non-boolean column
        elif _is_boolean_like(non_null):
            recommendations[col] = "boolean"
        # Text columns
        elif pd.api.types.is_object_dtype(sample) or pd.api.types.is_string_dtype(sample):
            nunique_ratio = non_null.nunique() / len(sample)
            if nunique_ratio < 0.1:  # low cardinality
                recommendations[col] = "category"
            else:
                recommendations[col] = "string"
    return recommendations
# Example: one sample Series per column, keyed by column name.
sample_data = dict(
    id=pd.Series(range(1000)),
    category=pd.Series(["A", "B"] * 500),
    flag=pd.Series([True, False] * 500),
    score=pd.Series(np.random.randn(1000)),
)
recommendations = best_practices_dtype_selection(sample_data)

# Print one "column: dtype" line per recommendation.
print("类型推荐:")
for col in recommendations:
    dtype = recommendations[col]
    print(f"{col}: {dtype}")
12. 常见问题和陷阱
12.1 整数NaN陷阱
# Pitfall: a NumPy int64 column has no way to store NaN, so requesting that
# dtype for data with a gap is reported as an error.
values_with_gap = [1, 2, np.nan]
try:
    s = pd.Series(values_with_gap, dtype="int64")
except ValueError as err:
    print("错误:", err)
    print("解决方案:使用Int64")

# The nullable Int64 dtype stores the gap as a missing value.
s_fixed = pd.Series(values_with_gap, dtype="Int64")
print(s_fixed)
12.2 浮点精度问题
# Floating point pitfall: 0.1 + 0.2 is not exactly 0.3 in binary floats.
a = 0.1 + 0.2
b = 0.3
print(f"0.1 + 0.2 == 0.3: {a == b}")  # False

# Option 1: exact decimal arithmetic.
import decimal
a_dec = decimal.Decimal('0.1') + decimal.Decimal('0.2')
b_dec = decimal.Decimal('0.3')
print(f"Decimal比较: {a_dec == b_dec}")  # True

# Option 2: comparison with a tolerance.
s1 = pd.Series([0.1, 0.2])
s2 = pd.Series([0.1000000001, 0.2000000001])
print("Pandas容差比较:")
# assert_series_equal returns None and raises AssertionError on a mismatch,
# so printing its return value would only ever show "None". Run it for the
# check and print an explicit boolean result instead.
pd.testing.assert_series_equal(s1, s2, check_exact=False, rtol=1e-6)
print(np.isclose(s1, s2, rtol=1e-6).all())
12.3 分类类型陷阱
# Categorical pitfall: only values from the category set can be assigned.
s_cat = pd.Series(["A", "B", "C"]).astype("category")

# Assigning a value outside the categories is rejected with an exception.
# The exception type differs between pandas versions (ValueError in older
# releases, TypeError in newer ones), so both are handled.
try:
    s_cat[0] = "D"  # "D" is not one of the categories
except (TypeError, ValueError) as e:
    print("错误:", e)

# Fix: register the new category first, then assign.
s_cat = s_cat.cat.add_categories("D")
s_cat[0] = "D"
print("正确添加新分类:", s_cat)
Pandas提供了丰富的数据类型系统,其中可空类型(Int64、Float64、boolean、string)统一使用 pd.NA 表示缺失值,大大提升了数据处理的灵活性;分类类型(category)则能显著降低低基数数据的内存占用。选择合适的数据类型是性能优化的关键,建议使用可空类型处理缺失值,使用分类类型处理低基数数据。