本文目录导读:
我来为你介绍几种常见的Python数据补全案例的实现方法。
使用Pandas进行数据补全
基础库准备
import pandas as pd
import numpy as np
# Build a small demo frame; every None entry is a missing value to impute.
df = pd.DataFrame(dict(
    A=[1, 2, None, 4, 5],
    B=[None, 2, 3, None, 5],
    C=[1, None, None, 4, 5],
))
# Show the untouched data so later results can be compared against it.
print("原始数据:", df, sep="\n")
常用补全方法
方法1:固定值填充
# Replace every missing value in the frame with 0.
df_filled_0 = df.fillna(value=0)
# Per-column replacements: the dict maps column name -> fill value.
df_filled_specific = df.fillna(value={'A': 0, 'B': 'missing', 'C': -1})
print("固定值填充结果:", df_filled_0, sep="\n")
方法2:统计值填充
# Each column's gaps are filled with that column's own mean.
df_filled_mean = df.fillna(value=df.mean())
# Same idea using the per-column median.
df_filled_median = df.fillna(value=df.median())
# Per-column mode; mode() can return several rows, so take the first one.
df_filled_mode = df.fillna(value=df.mode().iloc[0])
print("平均值填充结果:", df_filled_mean, sep="\n")
方法3:前后值填充
# Forward fill: propagate the previous valid value into each gap.
# DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated
# since pandas 2.1.
df_ffill = df.ffill()
# Backward fill: pull the next valid value back into each gap.
df_bfill = df.bfill()
print("前向填充结果:")
print(df_ffill)
方法4:插值法
# Linear interpolation: treats rows as equally spaced and ignores the index.
df_interpolate = df.interpolate(method='linear')
# Time-weighted interpolation only works with a DatetimeIndex, so work on a
# copy that carries one (one row per day here) instead of the integer index.
df_time_indexed = df.copy()
df_time_indexed.index = pd.date_range(start='2023-01-01', periods=len(df), freq='D')
df_interpolate_time = df_time_indexed.interpolate(method='time')
print("线性插值结果:")
print(df_interpolate)
使用Scikit-learn进行高级补全
安装依赖
pip install scikit-learn
完整案例:多特征数据补全
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
# A richer demo set: four numeric features, each with some missing entries.
data = {
    '年龄': [25, 30, None, 35, 40, None, 28, 33],
    '收入': [50000, 60000, 75000, None, 80000, 65000, None, 70000],
    '教育年限': [16, None, 14, 18, None, 12, 16, 14],
    '工作经验': [3, 5, 8, None, 12, 7, 2, 6],
}
# Seed NumPy's global RNG before any of the imputers below run.
np.random.seed(42)
df = pd.DataFrame(data)
print("原始数据:", df, sep="\n")
# 1. Simple imputation: strategy='mean' here.
simple_imputer = SimpleImputer(strategy='mean')
# fit_transform returns a plain array, so re-attach the column labels.
simple_filled_values = simple_imputer.fit_transform(df)
df_simple_imputed = pd.DataFrame(simple_filled_values, columns=df.columns)
print("\n简单均值填充:", df_simple_imputed, sep="\n")
# 2. KNN imputation with n_neighbors=2.
knn_imputer = KNNImputer(n_neighbors=2)
# fit_transform returns a plain array, so re-attach the column labels.
knn_filled_values = knn_imputer.fit_transform(df)
df_knn_imputed = pd.DataFrame(knn_filled_values, columns=df.columns)
print("\nKNN填充结果:", df_knn_imputed, sep="\n")
# 3. Iterative (MICE-style) imputation, using a small random forest as the
#    per-feature estimator.
forest_estimator = RandomForestRegressor(n_estimators=10)
iterative_imputer = IterativeImputer(
    estimator=forest_estimator,
    max_iter=10,
    random_state=42,
)
# fit_transform returns a plain array, so re-attach the column labels.
mice_filled_values = iterative_imputer.fit_transform(df)
df_mice_imputed = pd.DataFrame(mice_filled_values, columns=df.columns)
print("\nMICE迭代填充结果:", df_mice_imputed, sep="\n")
项目实战案例:销售数据补全
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
# Generate simulated daily sales data.
def generate_sales_data():
    """Build one year of synthetic daily sales with random gaps.

    The series is a yearly sine wave (amplitude 20 around a base of 100)
    plus Gaussian noise (std 15). Roughly 5% of the days are then set to
    NaN to simulate missing records. NumPy's global RNG is seeded with 42,
    so repeated calls return identical data.

    Returns:
        pandas.DataFrame with columns ``'date'`` (daily timestamps for 2023)
        and ``'sales'`` (float, NaN where a value was removed).
    """
    dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
    np.random.seed(42)
    # dates.dayofyear is a pandas Index; arithmetic on it yields another
    # (immutable) Index, so convert to a NumPy array before the masked
    # assignment below.
    day_of_year = dates.dayofyear.to_numpy()
    seasonal = 20 * np.sin(2 * np.pi * day_of_year / 365)
    noise = np.random.normal(0, 15, len(dates))
    sales = 100 + seasonal + noise
    # Randomly blank out about 5% of the days to simulate missing data.
    missing_mask = np.random.random(len(dates)) < 0.05
    sales[missing_mask] = np.nan
    return pd.DataFrame({
        'date': dates,
        'sales': sales,
    })
# Build the demo data set and take a first look at it.
df_sales = generate_sales_data()
print("数据概览(前10行):", df_sales.head(10), sep="\n")
# Count how many daily records are missing.
missing_total = df_sales['sales'].isna().sum()
print(f"\n总缺失值:{missing_total}")
# 1. Time-series-specific fills
# Carry the previous observation forward. Series.ffill() replaces
# fillna(method='ffill'), which is deprecated since pandas 2.1.
df_sales['sales_ffill'] = df_sales['sales'].ffill()
# Fill gaps with a trailing moving average; min_periods=1 lets the window
# produce a value as long as it contains at least one observation.
window_size = 7
df_sales['sales_ma'] = df_sales['sales'].rolling(window=window_size, min_periods=1).mean()
df_sales['sales_ma_filled'] = df_sales['sales'].fillna(df_sales['sales_ma'])
# 2. Interpolation-based fills
# 'linear' treats rows as equally spaced; 'polynomial' fits an order-2 curve.
for column, interp_kwargs in (
    ('sales_interp_linear', {'method': 'linear'}),
    ('sales_interp_poly', {'method': 'polynomial', 'order': 2}),
):
    df_sales[column] = df_sales['sales'].interpolate(**interp_kwargs)
# 3. Seasonality-based fill
# Average the observed sales per weekday, then use that weekday's average
# wherever a day is missing.
df_sales['day_of_week'] = df_sales['date'].dt.dayofweek
weekly_pattern = df_sales['sales'].groupby(df_sales['day_of_week']).mean()
weekday_means = df_sales['day_of_week'].map(weekly_pattern)
df_sales['sales_weekly_fill'] = df_sales['sales'].fillna(weekday_means)
# Visualize the results.
# NOTE(review): the labels are Chinese; matplotlib presumably needs a
# CJK-capable font configured to render them - confirm on the target machine.
plt.figure(figsize=(12, 6))
# Top panel: raw series (gaps visible) plus the linearly interpolated points.
plt.subplot(2, 1, 1)
plt.plot(df_sales['date'], df_sales['sales'], 'b-', alpha=0.7, label='原始数据')
plt.plot(df_sales['date'][df_sales['sales'].isna()],
         df_sales['sales_interp_linear'][df_sales['sales'].isna()],
         'ro', label='线性插值补全点', markersize=4)
# The title call was fused into the previous line in the original text.
plt.title('销售数据补全对比')
plt.legend()
plt.xticks(rotation=45)
# Bottom panel: the different imputation methods side by side.
plt.subplot(2, 1, 2)
plt.plot(df_sales['date'], df_sales['sales'], 'b-', alpha=0.5, label='原始数据')
plt.plot(df_sales['date'], df_sales['sales_ffill'], 'g-', alpha=0.7, label='前向填充')
plt.plot(df_sales['date'], df_sales['sales_interp_linear'], 'r--', alpha=0.7, label='线性插值')
plt.plot(df_sales['date'], df_sales['sales_weekly_fill'], 'm:', alpha=0.7, label='周模式填充')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Evaluate the imputation methods with a hold-out experiment.
# Comparing the filled columns with 'sales' on rows where 'sales' is present
# always gives an error of 0, because those rows were never imputed. Instead,
# hide 10% of the known values, re-run each method on the masked series, and
# score only the hidden points, whose true values are known.
observed_sales = df_sales.loc[df_sales['sales'].notna(), 'sales']
true_values = observed_sales.sample(frac=0.1, random_state=42)
holdout_idx = true_values.index

df_test = df_sales[['date', 'sales']].copy()
df_test.loc[holdout_idx, 'sales'] = np.nan
masked_sales = df_test['sales']
weekday = df_test['date'].dt.dayofweek

# Re-apply every method to the masked series so none of them sees the truth.
refilled = {
    'sales_ffill': masked_sales.ffill(),
    'sales_ma_filled': masked_sales.fillna(
        masked_sales.rolling(window=window_size, min_periods=1).mean()
    ),
    'sales_interp_linear': masked_sales.interpolate(method='linear'),
    'sales_weekly_fill': masked_sales.fillna(
        weekday.map(masked_sales.groupby(weekday).mean())
    ),
}

# The message now names the held-out points, which is what is being measured.
print("\n不同填充方法在留出验证点上的MAE:")
for method, filled in refilled.items():
    # Points a method could not fill stay NaN and are skipped by the mean.
    mae = np.mean(np.abs(true_values - filled.loc[holdout_idx]))
    print(f"{method}: {mae:.2f}")
最佳实践建议
选择补全方法的原则
def suggest_imputation_method(data_description):
    """Recommend imputation strategies from a summary of the data.

    Args:
        data_description: Mapping describing the data set. ``'missing_rate'``
            is required and is compared against the 0.05 and 0.2 thresholds.
            ``'is_time_series'`` and ``'has_correlated_features'`` are
            optional flags; an absent flag is treated as False.

    Returns:
        A list of suggestion strings: one for the missing-rate tier, followed
        by one per flag that is truthy.

    Raises:
        KeyError: If ``'missing_rate'`` is not present.
    """
    missing_rate = data_description['missing_rate']
    suggestions = []

    # Exactly one tier applies, chosen by the missing rate.
    if missing_rate < 0.05:
        suggestions.append("缺失率低,简单的均值/中位数填充即可")
    elif missing_rate < 0.2:
        suggestions.append("中等缺失率,考虑KNN或迭代填充")
    else:
        suggestions.append("高缺失率,需要谨慎处理,考虑删除或高级填充方法")

    # .get() keeps a caller that omits a flag from hitting a KeyError.
    if data_description.get('is_time_series', False):
        suggestions.append("时间序列数据,推荐使用插值法或前向/后向填充")
    if data_description.get('has_correlated_features', False):
        suggestions.append("特征间相关,推荐使用KNN或MICE填充")

    return suggestions
# Example usage: describe a data set, then print one bullet per suggestion.
data_info = dict(
    missing_rate=0.1,
    is_time_series=True,
    has_correlated_features=True,
)
print("数据补全建议:")
recommendations = suggest_imputation_method(data_info)
for suggestion in recommendations:
    print("- " + suggestion)
- 简单场景:使用Pandas的fillna()方法,选择适合的填充值
- 中等复杂度:使用插值法或KNN填充
- 复杂场景:使用迭代填充(MICE)或机器学习方法
- 时间序列:优先考虑时间序列特定的填充方法
选择补全方法时,要综合考虑缺失率、数据特征、业务场景和计算资源。建议尝试多种方法,并通过交叉验证(例如人为遮蔽一部分已知值,再比较各方法的补全误差)选择最佳方案。
标签: Python案例