一个有用的Python可视化库yellowbrick-Feature Analysis

发布时间:2021-12-03 公开文章


















# 加载数据集
import os
import pandas as pd
# Root directory holding one sub-folder per dataset, resolved against the
# current working directory at import time.
FIXTURES = os.path.join(os.getcwd(), "data")

# Maps a dataset name to the CSV file expected at <FIXTURES>/<name>/<name>.csv.
datasets = {
    "bikeshare": os.path.join(FIXTURES, "bikeshare", "bikeshare.csv"),
    "concrete": os.path.join(FIXTURES, "concrete", "concrete.csv"),
    "credit": os.path.join(FIXTURES, "credit", "credit.csv"),
    "energy": os.path.join(FIXTURES, "energy", "energy.csv"),
    "game": os.path.join(FIXTURES, "game", "game.csv"),
    "mushroom": os.path.join(FIXTURES, "mushroom", "mushroom.csv"),
    "occupancy": os.path.join(FIXTURES, "occupancy", "occupancy.csv"),
    "spam": os.path.join(FIXTURES, "spam", "spam.csv"),
}

def load_data(name, download=True):
    """
    Load the dataset registered under ``name`` as a pandas DataFrame.

    Parameters
    ----------
    name : str
        Key into the module-level ``datasets`` mapping.
    download : bool, default True
        Retained from the original helper. No download step is present in
        this snippet: when the CSV file is missing and this flag is truthy,
        a ``ValueError`` is raised that points the user at ``download.py``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the dataset path.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``datasets``.
    ValueError
        If the file does not exist and ``download`` is truthy.
    """
    # Get the path from the datasets
    path = datasets[name]

    # Check if the data exists, otherwise raise with a hint on how to fetch it.
    # With download falsy, a missing file falls through to pd.read_csv, which
    # reports the missing path itself.
    if not os.path.exists(path):
        if download:
            raise ValueError((
                "'{}' dataset has not been downloaded, "
                "use the download.py module to fetch datasets"
            ).format(name))

    # Return the data frame
    return pd.read_csv(path)


RadViz Visualizer


# Room occupancy dataset
data = load_data("occupancy")
# Feature columns to plot and the display labels for the two target classes
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ["unoccupied", "occupied"]
# Split the frame into the feature matrix X and the target column y
X = data[features]
y = data.occupancy

              datetime  temperature  relative humidity  light     C02  humidity  occupancy
0  2015-02-04 17:51:00        23.18            27.2720  426.0  721.25  0.004793          1
1  2015-02-04 17:51:59        23.15            27.2675  429.5  714.00  0.004783          1
2  2015-02-04 17:53:00        23.15            27.2450  426.0  713.50  0.004779          1
3  2015-02-04 17:54:00        23.15            27.2000  426.0  708.25  0.004772          1
4  2015-02-04 17:55:00        23.10            27.2000  426.0  704.50  0.004757          1

# Import the visualizer
from yellowbrick.features import RadViz
# Build the RadViz visualizer for the chosen feature columns and class labels
visualizer = RadViz(classes=classes, features=features)
# Fit, transform and then render the plot
visualizer.fit(X, y)      # Fit the data to the visualizer
visualizer.transform(X)   # Transform the data
visualizer.poof()         # Draw/show/poof the data


Rank Features

# Load the credit dataset
data = load_data('credit')

# Feature columns to rank (every listed name must be a column of `data`)
features = [
    'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay',
    'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill',
    'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay', 'jun_pay',
    'jul_pay', 'aug_pay', 'sep_pay',
]

# Split the frame into the feature matrix X and the target column y
X = data[features]
y = data.default

Rank 1D


from yellowbrick.features import Rank1D
# Build a one-dimensional feature ranking using the 'shapiro' algorithm
visualizer = Rank1D(features=features, algorithm='shapiro')
# NOTE(review): fit receives the raw array (X.values) while transform receives
# the DataFrame X — confirm this asymmetry is intended.
visualizer.fit(X.values, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof()                   # Draw/show/poof the data



Rank 2D


from yellowbrick.features import Rank2D

# Pairwise feature ranking using the 'covariance' algorithm
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof()                   # Draw/show/poof the data


# Same ranking, this time with the 'pearson' algorithm
visualizer = Rank2D(features=features, algorithm='pearson')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof()                   # Draw/show/poof the data


# Author's note on accepted `algorithm` values: {'pearson', 'covariance', 'spearman'}

Parallel Coordinates


# Load the classification data set
data = load_data("occupancy")

# Specify the features of interest and the classes of the target
features = [
    "temperature", "relative humidity", "light", "C02", "humidity"
]
classes = ["unoccupied", "occupied"]

# Extract the instances and target
X = data[features]
y = data.occupancy

              datetime  temperature  relative humidity  light     C02  humidity  occupancy
0  2015-02-04 17:51:00        23.18            27.2720  426.0  721.25  0.004793          1
1  2015-02-04 17:51:59        23.15            27.2675  429.5  714.00  0.004783          1
2  2015-02-04 17:53:00        23.15            27.2450  426.0  713.50  0.004779          1
3  2015-02-04 17:54:00        23.15            27.2000  426.0  708.25  0.004772          1
4  2015-02-04 17:55:00        23.10            27.2000  426.0  704.50  0.004757          1

from yellowbrick.features import ParallelCoordinates

# Instantiate the visualizer on a shuffled 50% sample of the instances
visualizer = ParallelCoordinates(
    classes=classes, features=features, sample=0.5, shuffle=True
)

# Fit and transform the data to the visualizer
visualizer.fit_transform(X, y)

# Finalize the title and axes then display the visualization
# NOTE(review): no explicit display call (such as poof()) follows here, unlike
# the earlier cells — confirm against the original notebook.
# Recorded notebook output: <Figure size 800x550 with 1 Axes>
# NOTE(review): this repeated fit_transform looks like a paste artifact; it is
# kept so the cell behaves as written.
visualizer.fit_transform(X, y)
from yellowbrick.features import ParallelCoordinates

# Instantiate the visualizer with 'standard' normalization on a shuffled
# 20% sample of the instances
visualizer = ParallelCoordinates(
    classes=classes, features=features,
    normalize='standard', sample=0.2, shuffle=True,
)

# Fit the visualizer and display it
visualizer.fit_transform(X, y)


from yellowbrick.features import ParallelCoordinates

# Instantiate the visualizer with the same settings as the previous cell,
# plus fast=True
visualizer = ParallelCoordinates(
    classes=classes, features=features,
    normalize='standard', sample=0.2, shuffle=True, fast=True,
)

# Fit the visualizer and display it
visualizer.fit_transform(X, y)


PCA Projection


import numpy as np
# Load the classification data set
data = load_data('credit')

# Specify the features of interest and the target:
# every column other than the target is used as a feature
target = "default"
features = [col for col in data.columns if col != target]

# Extract the instance data and the target
X = data[features]
y = data[target]

# Create a list of colors to assign to points in the plot:
# 'r' where the target value is truthy, 'b' otherwise
colors = np.array(['r' if yi else 'b' for yi in y])
from yellowbrick.features.pca import PCADecomposition

# Scaled PCA projection, colored per instance by the array built above
visualizer = PCADecomposition(scale=True, color=colors)
visualizer.fit_transform(X, y)
E:\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:625: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
E:\Anaconda3\lib\site-packages\sklearn\base.py:462: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
E:\Anaconda3\lib\site-packages\sklearn\pipeline.py:451: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)


# Same scaled, colored decomposition as the previous cell, now with proj_dim=3
visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3)
visualizer.fit_transform(X, y)
E:\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:625: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
E:\Anaconda3\lib\site-packages\sklearn\base.py:462: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
E:\Anaconda3\lib\site-packages\sklearn\pipeline.py:451: DataConversionWarning: Data with input dtype int64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)










# Concrete strength dataset
data = load_data('concrete')

# Specify the features of interest and the target
target = "strength"
features = [
    'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
]

# Extract the instance data and the target
X = data[features]
y = data[target]

# Scaled PCA decomposition with proj_features=True
visualizer = PCADecomposition(scale=True, proj_features=True)
visualizer.fit_transform(X, y)
E:\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
E:\Anaconda3\lib\site-packages\sklearn\base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
E:\Anaconda3\lib\site-packages\sklearn\pipeline.py:451: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)


# Same decomposition as the previous cell, now with proj_dim=3
visualizer = PCADecomposition(scale=True, proj_features=True, proj_dim=3)
visualizer.fit_transform(X, y)
E:\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
E:\Anaconda3\lib\site-packages\sklearn\base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
E:\Anaconda3\lib\site-packages\sklearn\pipeline.py:451: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  Xt = transform.transform(Xt)


# 粗、细两个特征向量是垂直的

Feature Importances


# Room occupancy dataset
data = load_data("occupancy")

# Specify the features of interest
features = [
    "temperature", "relative humidity", "light", "C02", "humidity"
]

# Extract the instances and target
X = data[features]
y = data.occupancy
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier

from yellowbrick.features.importances import FeatureImportances

# Create a new matplotlib figure with a single axes.
# The subplot spec is spelled out as 111 because a no-argument add_subplot()
# does nothing on matplotlib releases before 3.1; 111 is the default later on.
fig = plt.figure()
ax = fig.add_subplot(111)

# Fit a gradient boosting classifier and plot its feature importances on `ax`
viz = FeatureImportances(GradientBoostingClassifier(), ax=ax)
viz.fit(X, y)


# Concrete dataset
data = load_data("concrete")

# Specify the features of interest.
# NOTE(review): the list contents were missing from this cell; restored from
# the concrete feature list used in the PCA section — confirm.
features = [
    'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'
]

# Extract the instances and target
X = data[features]
y = data.strength
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso

from yellowbrick.features.importances import FeatureImportances

# Create a new figure with a single axes.
# The subplot spec is spelled out as 111 because a no-argument add_subplot()
# does nothing on matplotlib releases before 3.1; 111 is the default later on.
fig = plt.figure()
ax = fig.add_subplot(111)

# Title case the feature for better display and create the visualizer
labels = [name.title() for name in features]
viz = FeatureImportances(Lasso(), ax=ax, labels=labels, relative=False)

# Fit and show the feature importances
viz.fit(X, y)


# 混凝土强度和水是负相关的

Recursive Feature Elimination(RFE)


from sklearn.svm import SVC
from sklearn.datasets import make_classification

from yellowbrick.features import RFECV

# Create a dataset with only 3 informative features
# (25 features in total, 2 of them redundant, 8 target classes)
X, y = make_classification(
    n_samples=1000, n_features=25, n_informative=3, n_redundant=2,
    n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0
)

# Create RFECV visualizer with linear SVM classifier
viz = RFECV(SVC(kernel='linear', C=1))
viz.fit(X, y)


import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")  # Silence FutureWarning messages from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

data = load_data('credit')

# Every column except 'default' is a feature; 'default' is the target
features = [col for col in data.columns if col != 'default']

X = data[features]
y = data['default']

cv = StratifiedKFold(5)  # Author's note: 5 folds is really slow
# Recursive feature elimination with a random forest, scored by weighted F1
oz = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted')

oz.fit(X, y)

KeyboardInterrupt                         Traceback (most recent call last)



Scatter Plot Visualizer

# Load the classification data set
data = load_data("occupancy")

# Specify the features of interest and the classes of the target
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ["unoccupied", "occupied"]

# Extract the feature columns and the target column from the data frame
X = data[features]
y = data.occupancy
from yellowbrick.contrib.scatter import ScatterVisualizer

# Scatter plot of the "light" column against the "C02" column, labeled by class
visualizer = ScatterVisualizer(x="light", y="C02", classes=classes)

visualizer.fit(X, y)
E:\Anaconda3\lib\site-packages\yellowbrick\contrib\scatter.py:225: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  X_two_cols = X[self.features_].as_matrix()


Joint Plot Visualization


# Load the data
df = load_data("concrete")
# Single feature column and target column to plot against each other
feature = "cement"
target = "strength"

# Get the X and y data from the DataFrame
X = df[feature]
y = df[target]
from yellowbrick.features import JointPlotVisualizer

# Joint plot of the chosen feature against the target
visualizer = JointPlotVisualizer(feature=feature, target=target)

visualizer.fit(X, y)


# 水泥用量与强度之间的关系