数据预处理:特征选择

发布时间:2021-12-03 公开文章

Base

基础知识

Civil

土木分类资料

Python

Python编程学习

Tools

自媒体效率工具

 

过滤式特征选择VarianceThreshold

from sklearn.feature_selection import  VarianceThreshold,SelectKBest,f_classif

def test_VarianceThreshold():
    '''
    测试 VarianceThreshold  的用法

    :return:  None
    '''
    X=[[100,1,2,3],
       [100,4,5,6],
       [100,7,8,9],
       [101,11,12,13]]
    selector=VarianceThreshold(1)
    selector.fit(X)
    print("Variances is %s"%selector.variances_)
    print("After transform is %s"%selector.transform(X))
    print("The surport is %s"%selector.get_support(True))
    print("After reverse transform is %s"%
            selector.inverse_transform(selector.transform(X)))
test_VarianceThreshold()
Variances is [ 0.1875 13.6875 13.6875 13.6875]
After transform is [[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [11 12 13]]
The surport is [1 2 3]
After reverse transform is [[ 0  1  2  3]
 [ 0  4  5  6]
 [ 0  7  8  9]
 [ 0 11 12 13]]
def test_SelectKBest():
    '''
    测试 SelectKBest  的用法,其中考察的特征指标是 f_classif

    :return:  None
    '''
    X=[   [1,2,3,4,5],
          [5,4,3,2,1],
          [3,3,3,3,3,],
          [1,1,1,1,1] ]
    y=[0,1,0,1]
    print("before transform:",X)
    selector=SelectKBest(score_func=f_classif,k=3) # k=3
    selector.fit(X,y)
    print("scores_:",selector.scores_)
    print("pvalues_:",selector.pvalues_)
    print("selected index:",selector.get_support(True))
    print("after transform:",selector.transform(X))
test_SelectKBest()
before transform: [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [3, 3, 3, 3, 3], [1, 1, 1, 1, 1]]
scores_: [0.2 0.  1.  8.  9. ]
pvalues_: [0.69848865 1.         0.42264974 0.10557281 0.09546597]
selected index: [2 3 4]
after transform: [[3 4 5]
 [3 2 1]
 [3 3 3]
 [1 1 1]]

包裹式特征选择RFE,RFECV

from sklearn.feature_selection import  RFE,RFECV
from sklearn.svm import LinearSVC
from sklearn.datasets import  load_iris
from sklearn.model_selection import train_test_split

def test_RFE():
    '''
    测试 RFE 的用法,其中目标特征数量为 2

    :return: None
    '''
    iris=load_iris()
    X=iris.data
    y=iris.target
    estimator=LinearSVC()
    selector=RFE(estimator=estimator,n_features_to_select=2)
    selector.fit(X,y)
    print("N_features %s"%selector.n_features_)
    print("Support is %s"%selector.support_)
    print("Ranking %s"%selector.ranking_)
def test_RFECV():
    '''
    测试 RFECV 的用法

    :return:  None
    '''
    iris=load_iris()
    X=iris.data
    y=iris.target
    estimator=LinearSVC()
    selector=RFECV(estimator=estimator,cv=3)
    selector.fit(X,y)
    print("N_features %s"%selector.n_features_)
    print("Support is %s"%selector.support_)
    print("Ranking %s"%selector.ranking_)
    print("Grid Scores %s"%selector.grid_scores_)
def test_compare_with_no_feature_selection():
    '''
    比较经过特征选择和未经特征选择的数据集,对 LinearSVC 的预测性能的区别

    :return: None
    '''
    ### 加载数据
    iris=load_iris()
    X,y=iris.data,iris.target
    ### 特征提取
    estimator=LinearSVC()
    selector=RFE(estimator=estimator,n_features_to_select=2)
    X_t=selector.fit_transform(X,y)
    #### 切分测试集与验证集
    X_train,X_test,y_train,y_test=train_test_split(X, y,
                test_size=0.25,random_state=0,stratify=y)
    X_train_t,X_test_t,y_train_t,y_test_t=train_test_split(X_t, y,
                test_size=0.25,random_state=0,stratify=y)
    ### 测试与验证
    clf=LinearSVC()
    clf_t=LinearSVC()
    clf.fit(X_train,y_train)
    clf_t.fit(X_train_t,y_train_t)
    print("Original DataSet: test score=%s"%(clf.score(X_test,y_test)))
    print("Selected DataSet: test score=%s"%(clf_t.score(X_test_t,y_test_t)))
test_RFE() # 调用 test_RFE
N_features 2
Support is [False  True False  True]
Ranking [3 1 2 1]
test_compare_with_no_feature_selection() # 调用 test_compare_with_no_feature_selection
Original DataSet: test score=0.9473684210526315
Selected DataSet: test score=0.9473684210526315
test_RFECV() # 调用 test_RFECV
N_features 4
Support is [ True  True  True  True]
Ranking [1 1 1 1]
Grid Scores [0.91421569 0.94689542 0.95383987 0.96691176]

嵌入式特征选择

from sklearn.feature_selection import  SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.datasets import  load_digits,load_diabetes
import numpy as np
import  matplotlib.pyplot as plt
from sklearn.linear_model import Lasso

def test_SelectFromModel():
    '''
    测试 SelectFromModel 的用法。

    :return: None
    '''
    digits=load_digits()
    X=digits.data
    y=digits.target
    estimator=LinearSVC(penalty='l1',dual=False)
    selector=SelectFromModel(estimator=estimator,threshold='mean')
    selector.fit(X,y)
    selector.transform(X)
    print("Threshold %s"%selector.threshold_)
    print("Support is %s"%selector.get_support(indices=True))
def test_Lasso(*data):
    '''
    测试 alpha 与稀疏性的关系

    :param data: 可变参数。它是一个元组,这里要求其元素依次为:训练样本集、测试样本集、训练样本的值、测试样本的值
    :return: None
    '''
    X,y=data
    alphas=np.logspace(-2,2)
    zeros=[]
    for alpha in alphas:
        regr=Lasso(alpha=alpha)
        regr.fit(X,y)
        ### 计算零的个数 ###
        num=0
        for ele in regr.coef_:
            if abs(ele) < 1e-5:num+=1
        zeros.append(num)
    ##### 绘图
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ax.plot(alphas,zeros)
    ax.set_xlabel(r"$\alpha$")
    ax.set_xscale("log")
    ax.set_ylim(0,X.shape[1]+1)
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In Lasso")
    plt.show()
def test_LinearSVC(*data):
    '''
    测试 C  与 稀疏性的关系

    :param data: 可变参数。它是一个元组,这里要求其元素依次为:训练样本集、测试样本集、训练样本的标记、测试样本的标记
    :return: None
    '''
    X,y=data
    Cs=np.logspace(-2,2)
    zeros=[]
    for C in Cs:
        clf=LinearSVC(C=C,penalty='l1',dual=False)
        clf.fit(X,y)
         ### 计算零的个数 ###
        num=0
        for row in clf.coef_:
            for ele in row:
                if abs(ele) < 1e-5:num+=1
        zeros.append(num)
    ##### 绘图
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ax.plot(Cs,zeros)
    ax.set_xlabel("C")
    ax.set_xscale("log")
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In SVM")
    plt.show()
test_SelectFromModel() # 调用 test_SelectFromModel
Threshold 0.6759827221334211
Support is [ 2  3  4  5  6  9 12 13 14 18 19 20 21 22 26 27 30 33 36 38 41 42 43 44
 45 51 53 54 55 58 61]
data=load_diabetes() # 生成用于回归问题的数据集
test_Lasso(data.data,data.target) # 调用 test_Lasso

 

 

data=load_digits() # 生成用于分类问题的数据集
test_LinearSVC(data.data,data.target) # 调用 test_LinearSVC