import pandas as pd

# Load the dataset (the path is relative to the current working directory)
titanic_data = pd.read_csv('data/titanic.csv')
# Display the basic information and the first few rows of the dataset.
# DataFrame.info() writes its summary to stdout itself; head() has to be
# printed explicitly when this runs as a plain script instead of a
# notebook cell.
titanic_data.info()
print(titanic_data.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S


# Count the missing entries in every column
# (isna() is the canonical pandas spelling; isnull() is its alias)
missing_values = titanic_data.isna().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


# Checking data types
# `dtypes` yields one entry per column: the column name mapped to its dtype.
data_types = titanic_data.dtypes
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


# Identifying potential outliers by looking at summary statistics
# describe() is called with its default arguments; the resulting table is
# also the input of the summary bar plot drawn later in this file.
summary_statistics = titanic_data.describe()
  PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200


# Visual inspection of potential outliers
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the describe() table as a horizontal seaborn bar chart.
# NOTE(review): the table is passed in wide form, so each bar summarises a
# whole column of it (count, mean, std, ... together) -- confirm this is
# the intended picture.
summary_fig = plt.figure(figsize=(10, 6))
summary_ax = sns.barplot(data=summary_statistics, orient='h')
summary_ax.set_title('Summary Statistics of Titanic Data')




# Create box plots for 'Age' and 'Fare', side by side on a single figure
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Both panels use sns.boxplot so that each plot is what its title says it is
# (the Fare panel previously drew a boxen / letter-value plot instead).
sns.boxplot(ax=axes[0], x=titanic_data['Age']).set_title('Box Plot of Age')
sns.boxplot(ax=axes[1], x=titanic_data['Fare']).set_title('Box Plot of Fare')




Age:年龄的分布在较高的一端有一些异常值,但这些值在人类年龄的合理范围内。 Fare:与大多数数据相比,有几个异常值表明票价非常高。这些异常值可能代表奢华的住宿或单纯的数据错误。


# Addressing missing values
# Every column is re-assigned instead of being filled through
# `df[col].fillna(..., inplace=True)`: that chained in-place call operates
# on an intermediate Series, which recent pandas versions warn about and
# which leaves the DataFrame untouched once Copy-on-Write is active.

# 1. Imputing missing values in 'Age' with the median
median_age = titanic_data['Age'].median()
titanic_data['Age'] = titanic_data['Age'].fillna(median_age)
# 2. Imputing any missing value in 'Fare' based
# on the median of the corresponding Pclass
median_fare = titanic_data.groupby('Pclass')['Fare'].median()
titanic_data['Fare'] = titanic_data.apply(
    lambda row: median_fare[row['Pclass']] if pd.isnull(row['Fare'])
    else row['Fare'], axis=1)
# 3. Filling missing values in 'Cabin' with 'Unknown'
titanic_data['Cabin'] = titanic_data['Cabin'].fillna('Unknown')
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       2
dtype: int64
# Fill the remaining gaps in 'Embarked' with 'S'. The column is re-assigned
# because a chained `fillna(..., inplace=True)` acts on an intermediate
# Series and does not update the DataFrame under pandas Copy-on-Write.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('S')
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


# Descriptive statistics for numerical columns
# Computed again at this point, i.e. after the fillna steps earlier in the
# file have run on the DataFrame.
numerical_stats = titanic_data.describe()
  PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

PassengerId: 范围从1到891。 Survived: 大约38.38%的乘客最终活了下来(mean=0.38)。 Pclass: 大多数乘客的船票类型为第3类(mean=2.31)。 Age: 范围从0.42到80岁,年龄中位数为28岁。 SibSp: 多数乘客在船上没有兄弟姐妹/配偶。 Parch: 多数乘客在船上没有父母/孩子。 Fare: 票价范围分布较广,票价中位数为14.45。


# Frequency table for every object-dtype (categorical / text) column
categorical_columns = titanic_data.select_dtypes(include=object).columns
# Walk the selected columns as (name, Series) pairs and count each value
categorical_stats = {
    name: values.value_counts()
    for name, values in titanic_data[categorical_columns].items()
}
{'Name': Braund, Mr. Owen Harris                     1
 Boulos, Mr. Hanna                           1
 Frolicher-Stehli, Mr. Maxmillian            1
 Gilinski, Mr. Eliezer                       1
 Murdlin, Mr. Joseph                         1
 Kelly, Miss. Anna Katherine "Annie Kate"    1
 McCoy, Mr. Bernard                          1
 Johnson, Mr. William Cahoone Jr             1
 Keane, Miss. Nora A                         1
 Dooley, Mr. Patrick                         1
 Name: Name, Length: 891, dtype: int64,
 'Sex': male      577
 female    314
 Name: Sex, dtype: int64,
 'Ticket': 347082      7
 CA. 2343    7
 1601        7
 3101295     6
 CA 2144     6
 9234        1
 19988       1
 2693        1
 PC 17612    1
 370376      1
 Name: Ticket, Length: 681, dtype: int64,
 'Cabin': Unknown        687
 C23 C25 C27      4
 G6               4
 B96 B98          4
 C22 C26          3
 E34              1
 C7               1
 C54              1
 E36              1
 C148             1
 Name: Cabin, Length: 148, dtype: int64,
 'Embarked': S    646
 C    168
 Q     77
 Name: Embarked, dtype: int64}

Name:所有值唯一。 Sex:男性577人,女性314人。 Ticket:船票种类较多,部分乘客共享一张船票。 Cabin:大多数是“未知”(我们在前面处理缺失值时填充的)。在已知的客舱中,有一些是重复的。 Embarked:大多数乘客在南安普敦(S)港口登船,其次是瑟堡(C)和皇后镇(Q)港口。



# Histograms of the numerical columns, arranged on a 3 x 2 grid
plt.figure(figsize=(15, 10))

# Columns to plot, in grid order
numerical_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# One panel per column; subplot positions are 1-based
for position, column in enumerate(numerical_columns, start=1):
    panel = plt.subplot(3, 2, position)
    sns.histplot(titanic_data[column], kde=False, color='#21a675', ax=panel)
    panel.set_title(f'Histogram of {column}')





Survived:死亡乘客(0)数量远高于幸存乘客(1)数量。 Pclass:三等舱的乘客最多,其次是一等舱和二等舱。 Age:分布有点偏右,乘客的年龄段主要集中在20-30岁。 SibSp:大多数乘客在船上都没有兄弟姐妹或配偶。 Parch:与 SibSp 类似,大多数乘客在旅行途中都没有携带父母或孩子。 Fare:分布严重偏右,表明大多数票价都很低,只有少数较高的异常值。


# Count plots of the categorical columns, two panels in a single row
plt.figure(figsize=(12, 5))

# Columns to plot, left to right
cat_columns = ['Sex', 'Embarked']

# One panel per column; subplot positions are 1-based
for position, column in enumerate(cat_columns, start=1):
    panel = plt.subplot(1, 2, position)
    sns.countplot(data=titanic_data, x=column, ax=panel)
    panel.set_title(f'Bar Chart of {column}')





Sex:男性乘客多于女性乘客。 Embarked:大多数乘客从南安普敦港口(S)登船,其次是瑟堡(C)和皇后镇(Q)港口。


Survived vs Pclass:检查不同船票类型与幸存率之间存在怎样的关系。 Survived vs Sex:检查男性和女性乘客的幸存率差异。 Age vs Fare:探索乘客年龄与他们所付票价之间的关系。

# One figure holding three side-by-side panels
plt.figure(figsize=(18, 6))

# Panel 1: mean of 'Survived' for each passenger class (Pclass)
class_ax = plt.subplot(1, 3, 1)
sns.barplot(data=titanic_data, x='Pclass', y='Survived', ax=class_ax)
class_ax.set_title('Survived Rate by Pclass')

# Panel 2: mean of 'Survived' for each Sex
sex_ax = plt.subplot(1, 3, 2)
sns.barplot(data=titanic_data, x='Sex', y='Survived', ax=sex_ax)
sex_ax.set_title('Survived Rate by Sex')

# Panel 3: Age against Fare, points coloured by 'Survived'
scatter_ax = plt.subplot(1, 3, 3)
sns.scatterplot(data=titanic_data, x='Age', y='Fare', hue='Survived',
                ax=scatter_ax)
scatter_ax.set_title('Age vs Fare')





Survived vs Pclass:条形图显示了一个明显的趋势,即上层阶级(1级和2级)的存活率高于第三级。 Survived vs Sex:这张图显示了性别之间存活率的显著差异,女性的存活率比男性高得多。 Age vs Fare:散点图显示了年龄和票价之间的关系,颜色表示幸存率(蓝色表示存活,橙色表示未存活)。虽然很明显票价越高(可能是异常值)幸存率越高,但年龄和票价与幸存率之间似乎没有明确的关系。


# 1. Mean Age and mean Fare within each passenger class (Pclass)
avg_age_fare_per_plcass = (
    titanic_data[['Pclass', 'Age', 'Fare']].groupby('Pclass').mean()
)

# 2. Mean of 'Survived' (i.e. the survival rate) within each Sex group
survival_rate_by_gender = (
    titanic_data['Survived'].groupby(titanic_data['Sex']).mean()
)

# Bare expression: a notebook cell displays this pair as its result
avg_age_fare_per_plcass, survival_rate_by_gender
(              Age       Fare
 1       36.812130  84.154687
 2       29.765380  20.662183
 3       25.932627  13.675550,
 female    0.742038
 male      0.188908
 Name: Survived, dtype: float64)

平均年龄及票价: 头等舱:平均年龄约36.81岁,平均票价约84.15美元。 二等舱:平均年龄29.77岁左右,平均票价20.66美元。 三等舱:平均年龄25.93岁,平均票价13.68美元。 根据性别来看幸存率: 女性:幸存率约74.20%。 男性:幸存率约18.89%。


# 1. Defining a high fare as one that is above the 95th percentile
high_fara_threshold = titanic_data['Fare'].quantile(0.95)
high_fare_data = titanic_data[titanic_data['Fare'] > high_fara_threshold]

# 2. Age Groups with Unusually High or Low Survival Rates
# With right=False every bin is left-closed / right-open:
# [0, 12), [12, 18), [18, 40), [40, 60), [60, inf).
# The last edge is open-ended on purpose: a closing edge of 80 would leave
# a passenger aged exactly 80 outside every bin (AgeGroup = NaN) and
# silently drop them from the per-group statistics.
age_bins = [0, 12, 18, 40, 60, float('inf')]
age_labels = ['Child', 'Teen', 'Adult', 'Middle-Aged', 'Senior']
titanic_data['AgeGroup'] = pd.cut(titanic_data['Age'], bins=age_bins,
                                  labels=age_labels, right=False)

# Calculating survival rates for each age group.
# AgeGroup is categorical; observed=False is spelled out so every label
# stays in the result and the outcome does not depend on the pandas default.
survival_rates_by_age_group = titanic_data.groupby(
    'AgeGroup', observed=False)['Survived'].mean()

print(f"{high_fare_data[['Fare', 'Pclass', 'Survived', 'Age', 'Sex']]}\n")
Fare  Pclass  Survived    Age     Sex
27   263.0000       1         0  19.00    male
31   146.5208       1         1  28.00  female
88   263.0000       1         1  23.00  female
118  247.5208       1         0  24.00    male
195  146.5208       1         1  58.00  female
215  113.2750       1         1  31.00  female
258  512.3292       1         1  35.00  female
268  153.4625       1         1  58.00  female
269  135.6333       1         1  35.00  female
297  151.5500       1         0   2.00  female
299  247.5208       1         1  50.00  female
305  151.5500       1         1   0.92    male
311  262.3750       1         1  18.00  female
318  164.8667       1         1  31.00  female
319  134.5000       1         1  40.00  female
325  135.6333       1         1  36.00  female
332  153.4625       1         0  38.00    male
334  133.6500       1         1  28.00  female
337  134.5000       1         1  41.00  female
341  263.0000       1         1  24.00  female
373  135.6333       1         0  22.00    male
377  211.5000       1         0  27.00    male
380  227.5250       1         1  42.00  female
390  120.0000       1         1  36.00    male
393  113.2750       1         1  23.00  female
435  120.0000       1         1  14.00  female
438  263.0000       1         0  64.00    male
498  151.5500       1         0  25.00  female
527  221.7792       1         0  28.00    male
557  227.5250       1         0  28.00    male
609  153.4625       1         1  40.00  female
659  113.2750       1         0  58.00    male
660  133.6500       1         1  50.00    male
679  512.3292       1         1  36.00    male
689  211.3375       1         1  15.00  female
700  227.5250       1         1  18.00  female
708  151.5500       1         1  22.00  female
716  227.5250       1         1  38.00  female
730  211.3375       1         1  29.00  female
737  512.3292       1         1  35.00    male
742  262.3750       1         1  21.00  female
763  120.0000       1         1  36.00  female
779  211.3375       1         1  43.00  female
802  120.0000       1         1  11.00    male
856  164.8667       1         1  45.00  female

Child          0.573529
Teen           0.488889
Adult          0.357724
Middle-Aged    0.394161
Senior         0.240000
Name: Survived, dtype: float64

异常高昂的票价: 支付特别高票价(高于第95百分位数)的乘客大多在头等舱。 这些高票价乘客的幸存率通常很高。 这一群体的年龄和性别分布各不相同,男性和女性乘客都涵盖了多个年龄段。 幸存率异常高或异常低的年龄组: 儿童(12岁以下):幸存率57.35%。 青少年(12-17岁):幸存率为48.89%。 成人(18-39岁):幸存率35.77%。 中年(40-59岁):幸存率为39.42%。 老年人(60岁及以上):这一群体的幸存率最低,仅约四分之一。 高价乘客(主要是头等舱乘客)的幸存率更高,这与历史记录相符,即头等舱乘客更容易乘坐救生艇。 儿童的幸存率最高,这可能表明救援行动优先考虑了这一年龄组,与“在救生资源有限的情况下,儿童优先”的社会传统美德相符;而老年人的幸存率反而最低。然而,老年人的样本量很小,所以这一发现应该谨慎解释。如果要进一步验证这一见解的真实性,可考虑扩大数据集中老年人的样本量。


# One figure with a 2 x 3 grid; five of the six slots are used
plt.figure(figsize=(18, 10))

# Slot 1: survival rate by passenger class (Pclass)
pclass_ax = plt.subplot(2, 3, 1)
sns.barplot(data=titanic_data, x='Pclass', y='Survived', ax=pclass_ax)
pclass_ax.set_title('Survival Rate by Pclass')

# Slot 2: survival rate by Sex
sex_ax = plt.subplot(2, 3, 2)
sns.barplot(data=titanic_data, x='Sex', y='Survived', ax=sex_ax)
sex_ax.set_title('Survival Rate by Sex')

# Slot 3: survival rate by AgeGroup
age_group_ax = plt.subplot(2, 3, 3)
sns.barplot(data=titanic_data, x='AgeGroup', y='Survived', ax=age_group_ax)
age_group_ax.set_title('Survival Rate by Age Group')

# Slot 4: Fare against the 0/1 'Survived' flag
fare_ax = plt.subplot(2, 3, 4)
sns.scatterplot(data=titanic_data, x='Fare', y='Survived', ax=fare_ax)
fare_ax.set_title('Fare vs Survival Rate')

# Slot 5: survival rate by embarkation point
embarked_ax = plt.subplot(2, 3, 5)
sns.barplot(data=titanic_data, x='Embarked', y='Survived', ax=embarked_ax)
embarked_ax.set_title('Survival Rate by Embarkation Point')





船票类型 vs 幸存率:柱状图显示了一个明显的趋势,即头等舱和二等舱的乘客比三等舱的乘客幸存率高。 性别 vs 幸存率:图表显示了不同性别之间幸存率的显著差异:女性的幸存率(约74%)远高于男性(约19%)。 年龄组 vs 幸存率:图表显示,儿童的幸存率最高,其次是青少年,老年人的幸存率最低;当然,老年人的样本量较小,可能不能反映真实情况,可考虑扩展老年人的样本量进一步验证准确性。 票价 vs 幸存率:散点图显示,支付更高票价的乘客幸存下来的可能性更高。这种趋势与第一种(船票类型 vs 幸存率)一致,因为船票类型本身跟票价就是正相关。 登船点 vs 幸存率:图表显示了不同登船地点的乘客幸存率的差异,其中在皇后镇(Q)登船的乘客幸存率更高。

假设检验(Hypothesis Testing)

女性的幸存率高于男性 零假设(H0):性别与幸存率没有关系(即幸存率与性别无关)。 备择假设(H1):性别与存活率之间存在一定的关系(即幸存率取决于性别)。

from scipy.stats import chi2_contingency

# Cross-tabulate 'Sex' (rows) against 'Survived' (columns)
contingency_table = pd.crosstab(index=titanic_data['Sex'],
                                columns=titanic_data['Survived'])

# Chi-square test on the contingency table; only the first two of the four
# returned values (test statistic and p-value) are kept
test_result = chi2_contingency(contingency_table)
chi2, p_value = test_result[0], test_result[1]

# Bare expression: a notebook cell displays this pair as its result
chi2, p_value
(260.71702016732104, 1.1973570627755645e-58)

p值远低于典型的α水平0.05,这表明结果在统计上是显著的。 由于p值远低于显著性水平,所以我们拒绝零假设(H0),接受备择假设(H1)。 在统计上,性别与泰坦尼克号上的幸存率有显著的关系。 该数据支持“女性幸存率高于男性”的假设。




泛化能力差(Poor Generalization):


模型评估不准确(Inaccurate Model Evaluation):


不确定性高(High Uncertainty):

数据增强(Data Augmentation):



迁移学习(Transfer Learning):


正则化技术(Regularization Techniques):





模型偏向多数类(Bias towards Majority Class):


性能评价不准确(Misleading Performance Evaluation):


少数类的预测能力差(Poor Minority Class Prediction):


过度拟合少数类(Overfitting to Minority Class):

在一些过度调整以识别少数类的情况下,模型可能过拟合到这些类别,牺牲对多数类的正确预测。

重采样(Resampling):


合成数据生成(Synthetic Data Generation):

使用技术如SMOTE(Synthetic Minority Over-sampling Technique)来生成少数类的合成样本。

改变损失函数(Modifying Loss Function):


使用集成学习(Ensemble Learning):


专注于评价指标(Focusing on Evaluation Metrics):

使用像精确率-召回率曲线(Precision-Recall Curve)或F1分数等更适合处理不平衡数据的评价指标。


import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Generate an imbalanced binary classification dataset
# (class weights 0.9 / 0.1)
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    weights=[0.9, 0.1],
    random_state=42,
)
# Report how many samples each class has
print("原始数据集中,类别0的样本数:", int(np.count_nonzero(y == 0)))
print("原始数据集中,类别1的样本数:", int(np.count_nonzero(y == 1)))
原始数据集中,类别0的样本数: 897
原始数据集中,类别1的样本数: 103
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Oversampling: draw class-1 training rows until there are as many of them
# as there are class-0 training rows, then stack both classes together
is_minority = y_train == 1
is_majority = y_train == 0
X_minority_drawn, y_minority_drawn = resample(
    X_train[is_minority],
    y_train[is_minority],
    n_samples=int(is_majority.sum()),
    random_state=42,
)
X_oversampled = np.concatenate((X_train[is_majority], X_minority_drawn))
y_oversampled = np.concatenate((y_train[is_majority], y_minority_drawn))

print("过采样后的数据集中,类别0的样本数:", int(np.count_nonzero(y_oversampled == 0)))
print("过采样后的数据集中,类别1的样本数:", int(np.count_nonzero(y_oversampled == 1)))
过采样后的数据集中,类别0的样本数: 722
过采样后的数据集中,类别1的样本数: 722
# Undersampling (reduce the majority class): keep as many class-0 training
# rows as there are class-1 training rows.
# replace=False draws without replacement. resample() defaults to
# replace=True, which could pick the same majority row more than once and
# leave duplicates in the reduced set.
X_undersampled, y_undersampled = resample(X_train[y_train == 0], y_train[y_train == 0], replace=False, n_samples=len(X_train[y_train == 1]), random_state=42)
X_undersampled = np.concatenate((X_train[y_train == 1], X_undersampled))
y_undersampled = np.concatenate((y_train[y_train == 1], y_undersampled))

print("欠采样后的数据集中,类别0的样本数:", len(y_undersampled[y_undersampled == 0]))
print("欠采样后的数据集中,类别1的样本数:", len(y_undersampled[y_undersampled == 1]))
欠采样后的数据集中,类别0的样本数: 78
欠采样后的数据集中,类别1的样本数: 78


from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from collections import Counter
# Build an imbalanced binary classification dataset to serve as the example
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Show the class distribution of the original dataset
print('Original dataset shape:', Counter(y))
Original dataset shape: Counter({0: 990, 1: 10})
# Generate synthetic minority-class samples with SMOTE.
# random_state is fixed so the synthetic samples are the same on every run,
# in line with the seeded resampling and splitting elsewhere in this file.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Show the class distribution after the synthetic samples were added
print('Resampled dataset shape:', Counter(y_resampled))
Resampled dataset shape: Counter({0: 990, 1: 990})


from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Build an imbalanced binary classification dataset to serve as the example
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Hold out 30% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Per-class weights for the logistic regression: class 1 is weighted
# ten times as heavily as class 0
weights = {0: 1, 1: 10}

# Create the model and train it (fit() returns the fitted estimator itself)
clf = LogisticRegression(class_weight=weights).fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred)
print(report)
precision    recall  f1-score   support

           0       1.00      1.00      1.00       297
           1       0.67      0.67      0.67         3

    accuracy                           0.99       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.99      0.99      0.99       300


from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Build an imbalanced binary classification dataset to serve as the example
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Hold out 30% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Random forest: an ensemble of 10 trees, created and trained in one step
# (fit() returns the fitted estimator itself)
clf = RandomForestClassifier(n_estimators=10, random_state=42).fit(
    X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred)
print(report)
precision    recall  f1-score   support

           0       0.99      1.00      1.00       297
           1       1.00      0.33      0.50         3

    accuracy                           0.99       300
   macro avg       1.00      0.67      0.75       300
weighted avg       0.99      0.99      0.99       300


from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
import matplotlib.pyplot as plt

# Build an imbalanced binary classification dataset to serve as the example
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)

# Hold out 30% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Train a logistic regression model (fit() returns the fitted estimator)
clf = LogisticRegression().fit(X_train, y_train)

# Test-set predictions: hard labels, plus the probability of class 1
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Precision, recall and the matching thresholds, from the probabilities
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# F1 score, from the hard label predictions
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Plot the precision-recall curve (recall on x, precision on y)
plt.plot(recall, precision, marker='.')
plt.title('Precision-Recall Curve')
F1 Score: 0.8