Feature Engineering in Practice: Titanic Survivor Prediction

Dataset download: https://www.kaggle.com/c/titanic/data
This case study focuses on how feature engineering is applied to the dataset. Only a simple logistic regression (LR) model is used, so the final score is not particularly high.

import pandas as pd
import numpy as np
# Seaborn is essentially a higher-level API wrapper around matplotlib that makes plotting easier
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split, learning_curve, validation_curve, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2


# Tabulate each feature's missing-value count and its share of the total sample
def draw_missing_data_table(data):
   total = data.isnull().sum().sort_values(ascending=False)
   percent = (data.isnull().sum() / data.shape[0]).sort_values(ascending=False)
   missing_data = pd.concat([total, percent], axis=1, keys=["Total", "Percent"])
   return missing_data


train_data = pd.read_csv("./titanic/train.csv")
# print(df.head(2))
# print(df.describe())
# print(df.dtypes)

missing_df = draw_missing_data_table(train_data)
# print(missing_df)
# More than 25% of Cabin values are missing, so that feature is dropped outright; Ticket is dropped as well. Age is missing for nearly 20% of rows and needs to be imputed
train_data.drop(["Ticket", "Cabin"], axis=1, inplace=True)
# Use the title contained in each name: compute the mean age per title, then fill each person's missing Age with the mean age for their title
# Extract the title with a regular expression
train_data["Title"] = train_data["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)

train_data.drop(["Name"], axis=1, inplace=True)

# # Plot the mean age for each title
# plt.figure(figsize=(15, 5))
# # Seaborn groups the values by the Title column and aggregates each group with the estimator function (the mean by default); the aggregated value is what each bar shows
# # The error bars indicate how uncertain each group's value is around the bar height
# sns.barplot(x=train_data["Title"], y=train_data["Age"])
# plt.show()
# plt.close()

# Dict recording the sample count for each title (how often each title appears)
# print(train_data["Title"].value_counts().to_dict())
# Dict recording the mean Age of passengers with each title
age_means = train_data.groupby("Title")["Age"].mean().to_dict()
# Row indices where the Age feature is missing
index_nan_age = train_data.loc[np.isnan(train_data["Age"])].index
# .map(age_means) maps each title to its mean age
train_data.loc[index_nan_age, "Age"] = train_data["Title"].loc[index_nan_age].map(age_means)
# New feature recording which rows had Age filled with the title mean (imputed rows are flagged 1)
train_data["Imputed"] = 0
train_data.loc[index_nan_age, "Imputed"] = 1

# # Explore relationships between variables. We hypothesize that a higher cabin class means a better chance of survival, so plot it first
# sns.barplot(train_data["Pclass"], train_data["Survived"])
# plt.show()
# plt.close()
# # The plot shows that higher-class passengers were indeed more likely to survive

# We assume a passenger's title affected how they were treated. Analyze the titles to see whether they can be grouped in a sensible way
# print(train_data.groupby(["Title"])["PassengerId"].count().to_dict())
# The grouped result still has too many categories, so merge them further into just five: Other, Mrs, Miss, Mr, Master
titles_dict = {"Capt": "Other", "Major": "Other", "Jonkheer": "Other", "Don": "Other", "Sir": "Other", "Dr": "Other",
               "Rev": "Other", "Countess": "Other", "Dona": "Other", "Mme": "Mrs", "Mlle": "Miss", "Ms": "Miss",
               "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master", "Lady": "Other"}

# .map(titles_dict) maps each title to its merged category, so the Title feature now takes only a few distinct values
train_data["Title"] = train_data["Title"].map(titles_dict)
# Convert Title to a categorical feature so it can be one-hot encoded later
train_data["Title"] = pd.Categorical(train_data["Title"])

# # Plot the effect of Title on the chance of survival
# sns.barplot(train_data["Title"], train_data["Survived"])
# plt.show()
# plt.close()
# # In the new grouping, the Other and Master categories have long error bars, so their means carry noticeable uncertainty

# # Next, look at the effect of Sex on the chance of survival
# train_data["Sex"] = pd.Categorical(train_data["Sex"])
# sns.barplot(train_data["Sex"], train_data["Survived"])
# plt.show()
# plt.close()
# # Clearly, women were far more likely to survive

# # Now look at Age. We hypothesize that children were more likely to survive and the elderly less so
# limit_1, limit_2 = 12, 50
# x_limit_1 = np.size(train_data[train_data["Age"] < limit_1]["Age"].unique())
# x_limit_2 = np.size(train_data[train_data["Age"] < limit_2]["Age"].unique())
# plt.figure(figsize=(25, 10))
# sns.barplot(train_data["Age"], train_data["Survived"], ci=None)
# # .axvspan shades a rectangle over a given x-range of the plot in a chosen color
# plt.axvspan(-1, x_limit_1, alpha=0.25, color="green")
# plt.axvspan(x_limit_1, x_limit_2, alpha=0.25, color="red")
# plt.axvspan(x_limit_2, 100, alpha=0.25, color="yellow")
# plt.xticks(rotation=90)
# plt.show()
# plt.close()

# It works better to bin Age into three groups: 0-12, 12-50, and over 50 (the upper bin edge of 200 is just a safe bound)
train_data["Age"] = pd.cut(train_data["Age"], bins=[0, 12, 50, 200], labels=["Child", "Adult", "Elder"])

# # Plot again
# sns.barplot(train_data["Age"], train_data["Survived"])
# plt.show()
# plt.close()
# # Children clearly have the highest survival rate, while adults and the elderly show no obvious difference

# Combine SibSp (number of siblings/spouses aboard) and Parch (number of parents/children aboard) into a new feature, FamilySize
train_data["FamilySize"] = train_data["SibSp"] + train_data["Parch"]
train_data.drop(["SibSp", "Parch"], axis=1, inplace=True)

# # Check how this new feature affects survival; we hypothesize that a larger FamilySize means a better chance of survival
# sns.barplot(train_data["FamilySize"], train_data["Survived"])
# plt.show()
# plt.close()
# # For FamilySize between 0 and 3 the hypothesis gets some support, but from 4 to 10 the survival rate drops instead
# # This suggests the hypothesis needs revising once FamilySize exceeds 3
# # We therefore apply no transformation to this variable and keep it as a continuous feature

# # Next, the effect of Fare on survival; Fare should be strongly correlated with Pclass
# plt.figure(figsize=(7.5, 5))
# # This time draw a box plot
# sns.boxplot(train_data["Survived"], train_data["Fare"])
# plt.show()
# plt.close()
# # Survivors clearly paid higher fares

# # Now look at Fare, Pclass, and survival together
# sns.barplot(train_data["Survived"], train_data["Fare"], train_data["Pclass"])
# plt.show()
# plt.close()
# # In second and third class, Fare makes little difference to survival, but in first class a higher fare means a higher survival probability

# # The port of embarkation should have little effect on survival; plot it anyway to check
# # C = Cherbourg, Q = Queenstown, S = Southampton
# sns.barplot(train_data["Embarked"], train_data["Survived"])
# plt.show()
# plt.close()
# # Oddly, passengers who embarked at C had a higher survival rate; this may be confounded by Sex or other variables

# # Group by Embarked and take the mean: passengers who boarded at C paid higher fares
# print(train_data.groupby(["Embarked"]).mean())
# # Group by Embarked and then by the three Age bins: there is no clear difference in age mix across the ports
# print(train_data.groupby(["Embarked", "Age"])["PassengerId"].count())
# # Group by Embarked and then by Sex: no obvious relationship either (women are always outnumbered by men)
# print(train_data.groupby(["Embarked", "Sex"])["PassengerId"].count())
# # In short, Fare is the more likely driver of survival, and the port of embarkation itself does not affect it

train_data.drop(["PassengerId"], axis=1, inplace=True)
train_data["Embarked"] = pd.Categorical(train_data["Embarked"])
train_data["Pclass"] = pd.Categorical(train_data["Pclass"])
# drop_first=True drops the first category of each feature's one-hot encoding
# The reason is to avoid the dummy variable trap: a feature with k categories only needs k-1 one-hot columns to represent all k categories
train_data = pd.get_dummies(train_data, drop_first=True)
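# As a quick illustration of drop_first (a minimal sketch on a made-up toy column, not part of the original pipeline):
# toy = pd.DataFrame({"Port": ["S", "C", "Q", "S"]})
# print(pd.get_dummies(toy, drop_first=True))
# # Only Port_Q and Port_S remain; a row with zeros in both columns implicitly encodes "C"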

# Build the training features and label
x = train_data.drop(["Survived"], axis=1)
y = train_data["Survived"]
# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

# Next, transform features with the Box-Cox transform, used when a continuous variable is not normally distributed; it can somewhat reduce unobservable error and the correlation with the predictors
# That is, y(lambda) = (y^lambda - 1) / lambda if lambda != 0, and y(lambda) = log(y) if lambda = 0 (Fare + 1 is used below so the input is strictly positive)
# Box-Cox can usually normalize the data successfully, but it does not work for binary variables or ordinal variables with few levels
x_train_transformed = x_train.copy()
x_train_transformed["Fare"] = boxcox(x_train_transformed["Fare"] + 1)[0]
x_test_transformed = x_test.copy()
x_test_transformed["Fare"] = boxcox(x_test_transformed["Fare"] + 1)[0]
# print(x_train_transformed)

# Min-max scaling
scaler = MinMaxScaler()
x_train_transformed_scaled = scaler.fit_transform(x_train_transformed.astype(float))
x_test_transformed_scaled = scaler.transform(x_test_transformed)

# Create new features with a degree-2 polynomial transform (fit on the scaled training data)
poly = PolynomialFeatures(degree=2).fit(x_train_transformed_scaled)
# print(poly.get_feature_names())
# fit_transform is equivalent to calling fit and then transform
x_train_poly = poly.transform(x_train_transformed_scaled)
x_test_poly = poly.transform(x_test_transformed_scaled)
# print(x_train_poly[0])

# Select features with the chi-squared test (chi2 requires non-negative features, which the min-max scaling above guarantees)
# First build a baseline LR model for the chi-squared experiments; C is the inverse regularization strength
ka_base_lr_model = LogisticRegression(solver="liblinear", C=1)
ka_base_lr_model.fit(x_train, y_train)
ka_lr_acc_list = cross_val_score(ka_base_lr_model, x_train, y_train, cv=10, scoring="accuracy")
# print("卡方检验基础lr模型的交叉验证平均准确率:{:.3f}".format(np.mean(ka_lr_acc_list)))
# 以初始模型的准确率作为最高准确率的初始值,以其方差作为最小方差的初始值
highest_score = np.mean(ka_lr_acc_list)
smallest_std = np.std(ka_lr_acc_list)
k_features_highest_score = x_train_poly.shape[1]

# Feature selection loop
for i in range(1, x_train_poly.shape[1] + 1):
   # On each pass the chi-squared test keeps the i highest-scoring features
   select = SelectKBest(score_func=chi2, k=i)
   select.fit(x_train_poly, y_train)
   # Dataset keeping only the i highest-scoring features
   x_train_poly_selected = select.transform(x_train_poly)
   # Fit an LR model on that reduced dataset and compute its cross-validated accuracy
   ka_base_lr_model.fit(x_train_poly_selected, y_train)
   scores = cross_val_score(ka_base_lr_model, x_train_poly_selected, y_train, cv=10, scoring="accuracy")
   # print("选择{}个特征的数据集产生的lr模型的交叉验证平均准确率:{:.3f}".format(i, np.mean(scores)))
   # 如果这个新模型的准确率高于最高得分,那么以这个模型的准确率作为最高准确率
   # 如果准确率相同,那么判断谁的方差更小,以方差更小的模型作为最佳模型
   if np.mean(scores) > highest_score:
      highest_score = np.mean(scores)
      smallest_std = np.std(scores)
      k_features_highest_score = i
   elif np.mean(scores) == highest_score:
      if np.std(scores) < smallest_std:
         highest_score = np.mean(scores)
         smallest_std = np.std(scores)
         k_features_highest_score = i

print("取得最佳准确率时的模型由{}个特征的数据集拟合得到".format(k_features_highest_score))

# Refit the best model with the chosen number of features
select = SelectKBest(score_func=chi2, k=k_features_highest_score)
select.fit(x_train_poly, y_train)
x_train_poly_selected = select.transform(x_train_poly)
lr = LogisticRegression(solver="liblinear", C=1)
lr.fit(x_train_poly_selected, y_train)
acc_list = cross_val_score(lr, x_train_poly_selected, y_train, cv=10, scoring="accuracy")

print("最佳lr模型的交叉验证平均准确率:{:.3f}".format(np.mean(acc_list)))


# Plot the learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
   plt.figure()
   plt.title(title)
   if ylim is not None:
      plt.ylim(*ylim)
   plt.xlabel("Training examples")
   plt.ylabel("Score")
   train_sizes, train_scores, test_scores = learning_curve(
      estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
   train_scores_mean = np.mean(train_scores, axis=1)
   train_scores_std = np.std(train_scores, axis=1)
   test_scores_mean = np.mean(test_scores, axis=1)
   test_scores_std = np.std(test_scores, axis=1)
   plt.grid()
   plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
   plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1,
                    color="g")
   plt.plot(train_sizes, train_scores_mean, "o-", color="r", label="Training score")
   plt.plot(train_sizes, test_scores_mean, "o-", color="g", label="Validation score")

   plt.legend(loc="best")
   return plt


# title = "Learning Curves (Logistic Regression)"
# cv = 10
# plt_image = plot_learning_curve(lr, title, x_train, y_train, ylim=(0.7, 1.01), cv=cv, n_jobs=1)
# plt.show()
# plt.close()


def plot_validation_curve(estimator, title, x, y, param_name, param_range, ylim=None, n_jobs=1,
                          train_sizes=np.linspace(.1, 1.0, 5)):
   train_scores, test_scores = validation_curve(estimator, x, y, param_name=param_name, param_range=param_range, cv=5)
   train_mean = np.mean(train_scores, axis=1)
   train_std = np.std(train_scores, axis=1)
   test_mean = np.mean(test_scores, axis=1)
   test_std = np.std(test_scores, axis=1)
   plt.plot(param_range, train_mean, color="r", marker="o", markersize=5, label="Training score")
   plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha=0.15, color="r")
   plt.plot(param_range, test_mean, color="g", linestyle="--", marker="s", markersize=5, label="Validation score")
   plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha=0.15, color="g")
   plt.grid()
   plt.xscale("log")
   plt.legend(loc="best")
   plt.title(title)
   plt.xlabel("Parameter")
   plt.ylabel("Score")
   plt.ylim(ylim)


# # Plot accuracy for different values of the regularization parameter C
# title = "Validation Curve (Logistic Regression)"
# param_name = "C"
# param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# plot_validation_curve(estimator=lr, title=title, x=x_train_poly_selected, y=y_train, param_name=param_name,
#                       ylim=(0.5, 1.01), param_range=param_range)
# plt.show()
# plt.close()

# Apply the same pipeline to the Kaggle test set
test_data = pd.read_csv("./titanic/test.csv")
test_data["FamilySize"] = test_data["SibSp"] + test_data["Parch"]
test_data.drop(["SibSp", "Parch"], axis=1, inplace=True)
test_data.drop(["Ticket", "Cabin"], axis=1, inplace=True)

# Extract the title with a regular expression, as was done for the training set
test_data["Title"] = test_data["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)

test_data.drop(["Name"], axis=1, inplace=True)

test_age_means = test_data.groupby("Title")["Age"].mean().to_dict()
test_index_nan_age = test_data.loc[np.isnan(test_data["Age"])].index
test_data.loc[test_index_nan_age, "Age"] = test_data["Title"].loc[test_index_nan_age].map(test_age_means)
test_data["Title"] = test_data["Title"].map(titles_dict)
test_data["Title"] = pd.Categorical(test_data["Title"])
test_data["Imputed"] = 0
test_data.loc[test_index_nan_age, "Imputed"] = 1

test_data["Age"] = pd.cut(test_data["Age"], bins=[0, 12, 50, 200], labels=["Child", "Adult", "Elder"])
passenger_id = test_data["PassengerId"].values
test_data.drop("PassengerId", axis=1, inplace=True)

test_data["Embarked"] = pd.Categorical(test_data["Embarked"])
test_data["Pclass"] = pd.Categorical(test_data["Pclass"])
test_data = pd.get_dummies(test_data, drop_first=True)

# Fare has one missing value in the test set; fill it with the mean
test_data = test_data.fillna(test_data.mean())

X = test_data
X_transformed = X.copy()
X_transformed["Fare"] = boxcox(X_transformed["Fare"] + 1)[0]

Scaler = MinMaxScaler()
X_transformed_scaled = Scaler.fit_transform(X_transformed.astype(float))
poly = PolynomialFeatures(degree=2).fit(X_transformed_scaled)
X_poly = poly.transform(X_transformed_scaled)
X_poly_selected = select.transform(X_poly)

# Predict and write the predictions to the submission file
predictions = lr.predict(X_poly_selected)
submission = pd.DataFrame({"PassengerId": passenger_id, "Survived": predictions})
submission.to_csv("submission.csv", index=False)
