# Load the Yellowbrick spam dataset
import os
import pandas as pd
FIXTURES = os.path.join(os.getcwd(), "data")
df = pd.read_csv(os.path.join(FIXTURES, "spam", "spam.csv"))
df.head()
(Output: the first 5 rows of the DataFrame, 5 rows × 58 columns: word-frequency columns such as word_freq_make and word_freq_address, character-frequency columns such as char_freq_! and char_freq_$, the capital_run_length_* columns, and the is_spam label.)
X, y = df.iloc[:, :-1], df.iloc[:, -1] # feature matrix and target vector
df["is_spam"].unique() # binary 1/0 labels; every other column is numeric as well
# classes=['not_spam','is_spam'] # corresponds to [0, 1]
array([1, 0], dtype=int64)
# Classification report (used below):
# visualizer = ClassificationReport(model, classes=['not_spam','is_spam'], support=True)
# Confusion matrix (used below):
# ConfusionMatrix(model, classes=['not_spam','is_spam'], label_encoder={0:"not_spam", 1:"is_spam"})
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson') # Pearson correlation coefficient
visualizer.fit(X, y) # Fit the data to the visualizer
visualizer.transform(X) # Transform the data
visualizer.poof() # Draw/show/poof the data
from sklearn.ensemble import RandomForestClassifier # classification model
from yellowbrick.features.importances import FeatureImportances
model = RandomForestClassifier(n_estimators=10)
viz = FeatureImportances(model)
viz.fit(X, y)
viz.poof()
FeatureImportances(absolute=False,
ax=<matplotlib.axes._subplots.AxesSubplot object at 0x0000025909186F98>,
labels=None, model=None, relative=True, stack=False, xlabel=None)
rfc = model.fit(X, y)
importances = rfc.feature_importances_
importances
array([0.00419625, 0.00329869, 0.00606181, 0.00068846, 0.02014877,
0.00599604, 0.05792883, 0.02877044, 0.00333147, 0.00927314,
0.00606247, 0.01202045, 0.00335088, 0.00115922, 0.00035176,
0.07424499, 0.02461916, 0.00705513, 0.02264122, 0.00251324,
0.05250165, 0.00250961, 0.00732771, 0.01208356, 0.06672894,
0.0112409 , 0.03217723, 0.00360037, 0.00101266, 0.00487104,
0.00037709, 0.00075526, 0.00115574, 0.00138721, 0.0027788 ,
0.00204865, 0.01613457, 0.00067385, 0.00496664, 0.00179599,
0.00153094, 0.00703848, 0.00252604, 0.00217318, 0.00794447,
0.02254089, 0.0003397 , 0.0008103 , 0.0039666 , 0.01288543,
0.00291571, 0.08633219, 0.15673341, 0.00346989, 0.05752226,
0.07943742, 0.03199316])
df1 = pd.DataFrame({'feature':df.columns[:-1],'importances':importances})
df2 = df1.sort_values('importances', ascending=False)
new_features = df[df2[df2['importances'] > 0.01]['feature'].tolist()] # keep only the features with importance > 0.01
new_features.shape # down from 57 features to 20; let's see how the results differ # the same bar chart can also be drawn with Bokeh, see the sketch below
(4600, 20)
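As noted in the comment above, the importance bar chart can also be reproduced with Bokeh. A minimal sketch, assuming Bokeh 3.x and the df2 table constructed above:
from bokeh.plotting import figure, show
# Horizontal bar chart of the sorted feature importances in df2.
features = df2['feature'].tolist()
p = figure(y_range=features, height=900, width=600,
           title="Random forest feature importances")
p.hbar(y=features, right=df2['importances'], height=0.8)
show(p) # opens the chart in a browser; use bokeh.io.output_notebook() first to render inline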
%%time
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ClassificationReport
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=10)
model = RandomForestClassifier(n_estimators=10)
visualizer = ClassificationReport(model, classes=['not_spam','is_spam'], support=True)
visualizer.fit(X_train.values, y_train) # Fit the visualizer and the model
print('Score:', visualizer.score(X_test.values, y_test)) # Evaluate the model on the test data
visualizer.poof() # Draw/show/poof the data
Score: 0.9576086956521739
Wall time: 188 ms
%%time
# using the 20 selected features
X_train, X_test, y_train, y_test = tts(new_features, y, test_size=0.2, random_state=10)
model = RandomForestClassifier(n_estimators=10)
visualizer = ClassificationReport(model, classes=['not_spam','is_spam'], support=True)
visualizer.fit(X_train.values, y_train) # Fit the visualizer and the model
print('Score:', visualizer.score(X_test.values, y_test)) # Evaluate the model on the test data
visualizer.poof()
Score: 0.9391304347826087
Wall time: 176 ms
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=10)
model = XGBClassifier()
visualizer = ClassificationReport(model, classes=['not_spam','is_spam'], support=True)
visualizer.fit(X_train.values, y_train) # Fit the visualizer and the model
print('Score:', visualizer.score(X_test.values, y_test)) # Evaluate the model on the test data
visualizer.poof() # Draw/show/poof the data
Score: 0.9478260869565217
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = tts(new_features, y, test_size=0.2, random_state=10)
model = XGBClassifier()
visualizer = ClassificationReport(model, classes=['not_spam','is_spam'], support=True)
visualizer.fit(X_train.values, y_train) # Fit the visualizer and the model
print('Score:', visualizer.score(X_test.values, y_test)) # Evaluate the model on the test data
visualizer.poof()
Score: 0.9445652173913044
from yellowbrick.classifier import ConfusionMatrix
model = RandomForestClassifier(n_estimators=10)
cm = ConfusionMatrix(model, classes=['not_spam','is_spam'], label_encoder={0: "not_spam", 1: "is_spam"})
cm.fit(X_train.values, y_train)
cm.score(X_test.values, y_test)
cm.poof()
E:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:182: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
score = y_true == y_pred
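This FutureWarning is NumPy balking at an elementwise comparison between arrays of different dtypes, most likely the integer labels versus the decoded string labels from label_encoder; the confusion matrix itself is still computed and drawn correctly.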
Key XGBoost parameters:
(1) objective [default=reg:linear]: defines the learning task and the corresponding objective. Available objective functions include:
"reg:linear": linear regression.
"reg:logistic": logistic regression.
"binary:logistic": logistic regression for binary classification; outputs probabilities.
"multi:softmax": multiclass classification; also requires setting num_class (the number of classes).
(2) eval_metric: the evaluation metric. Choices include:
"rmse": root mean squared error.
"logloss": negative log-likelihood.
(3) max_depth [default=6]: maximum depth of a tree. Valid range: [1, ∞).
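A minimal sketch of setting these parameters explicitly on the spam task; the values here are illustrative rather than tuned, and note that older xgboost versions expect eval_metric in fit() rather than in the constructor:
from xgboost import XGBClassifier
# Illustrative, untuned settings for the binary spam task.
model = XGBClassifier(
    objective='binary:logistic', # binary classification, outputs probabilities
    eval_metric='logloss',       # negative log-likelihood
    max_depth=4,                 # maximum tree depth; default is 6
)
model.fit(X_train.values, y_train)
print('Score:', model.score(X_test.values, y_test))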