Pandas 检验数值列的数据质量

输出每列:
- 缺失值数量(NaN)
- 无穷值数量(Inf)
- 唯一值数量(Unique)

检测并打印:
- 任意列包含 NaN 或 Inf 的行(最多前 10 条)
import pandas as pd
import numpy as np
def _inf_mask(series):
    """Return a plain boolean Series marking the +/-Inf entries of *series*."""
    if not pd.api.types.is_numeric_dtype(series):
        # np.isinf raises TypeError on object/string data, so such columns
        # are reported as containing no Inf instead of aborting the report.
        return pd.Series(False, index=series.index, dtype=bool)
    # Nullable dtypes can yield <NA> here; those cells are already flagged by
    # the NaN check, so treating them as "not Inf" loses nothing.
    return np.isinf(series).fillna(False).astype(bool)


def check_num_cols(df: pd.DataFrame, num_cols, max_rows: int = 10) -> None:
    """Print a data-quality report for the requested columns of *df*.

    For every requested column that exists in *df*, print:
    - the number of missing values (NaN),
    - the number of infinite values (Inf),
    - the number of unique non-missing values.

    Afterwards, print the rows in which any of those columns holds NaN or
    Inf (at most ``max_rows`` of them), or a confirmation that none exist.

    Args:
        df: DataFrame to inspect.
        num_cols: Iterable of column labels to check. Labels that are not in
            ``df.columns`` are reported with an error line and then ignored.
        max_rows: Maximum number of problematic rows to print (default 10).

    Returns:
        None. The report is written to stdout.
    """
    print("\n=== Column Summary ===")

    present_cols = []
    problem_mask = pd.Series(False, index=df.index, dtype=bool)

    for col in num_cols:
        if col not in df.columns:
            print(f" ERROR: Column '{col}' not found!")
            continue
        present_cols.append(col)

        column = df[col]
        nan_mask = column.isna()
        inf_mask = _inf_mask(column)

        na_count = int(nan_mask.sum())
        inf_count = int(inf_mask.sum())
        unique_count = column.nunique(dropna=True)
        print(
            f"{col:20} -> NaN: {na_count:5d} | Inf: {inf_count:5d} | Unique: {unique_count}"
        )

        # Reuse the per-column masks for the row-level check so the frame is
        # scanned only once.
        problem_mask = problem_mask | nan_mask | inf_mask

    # === Row-level problem detection ===
    # Only columns that actually exist are selected; indexing with the raw
    # num_cols would raise KeyError for the missing ones reported above.
    if problem_mask.any():
        print(
            f"\n⚠️ Found {problem_mask.sum()} rows with NaN or Inf. "
            f"Showing first {max_rows}:"
        )
        print(df.loc[problem_mask, present_cols].head(max_rows))
    else:
        print("\n✅ No NaN or Inf found in given columns.")