import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 10)
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #当坐标轴有负号的时候可以显示负号
%matplotlib inline
http://www.woshipm.com/data-analysis/712880.html
online=pd.read_csv("../data/数据源见页底.csv",parse_dates=['InvoiceDate']) # 直接将'InvoiceDate'列转为时间格式
online.head()
online.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 541909 non-null object
1 StockCode 541909 non-null object
2 Description 540455 non-null object
3 Quantity 541909 non-null int64
4 InvoiceDate 541909 non-null datetime64[ns]
5 UnitPrice 541909 non-null float64
6 CustomerID 406829 non-null float64
7 Country 541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB
def get_month(x):
return datetime(x.year, x.month,1)
online['InvoiceDate'].apply(get_month)
0 2010-12-01
1 2010-12-01
2 2010-12-01
3 2010-12-01
4 2010-12-01
...
541904 2011-12-01
541905 2011-12-01
541906 2011-12-01
541907 2011-12-01
541908 2011-12-01
Name: InvoiceDate, Length: 541909, dtype: datetime64[ns]
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month) # 将'InvoiceDate'返回为当月1号月初
online.head()
grouping = online.groupby('CustomerID')['InvoiceMonth']
grouping = online.groupby('CustomerID')['InvoiceMonth']
online['CohortMonth'] = grouping.transform('min') # 以用户ID转换该用户初次购买时间
online.tail()
def get_date_int(df, column):
year = df[column].dt.year
month = df[column].dt.month
day = df[column].dt.day
return year, month, day
cohort_year,cohort_month,_=get_date_int(online, 'CohortMonth')
invoice_year, invoice_month,_ = get_date_int(online, 'InvoiceDate')
years_diff = invoice_year - cohort_year
months_diff = invoice_month - cohort_month
online['CohortPeriod'] = years_diff * 12 + months_diff + 1 # 第一个用户有活跃5个月
online.tail()
online['Cohort']=online['CohortMonth'].astype(str)
online['Cohort']=online['Cohort'].apply(lambda x:x[:-3])
online.head()
grouping = online.groupby(['Cohort', 'CohortPeriod'])
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique) #活跃用户
cohort_data = cohort_data.reset_index()
cohort_data.head()
cohort_counts = cohort_data.pivot(index='Cohort',
columns='CohortPeriod',
values='CustomerID')
cohort_counts
cohort_sizes = cohort_counts.iloc[:,0] # 用第一列数据对比计算留存率,归一化
retention = cohort_counts.divide(cohort_sizes, axis=0)
retention.head()
retention.round(3)*100
retention.iloc[[0,1,2],:].T
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource,SingleIntervalTicker,FactorRange
from math import pi
output_notebook()
<div class="bk-root">
<a href="https://bokeh.pydata.org" target="_blank" class="bk-logo bk-logo-small bk-logo-notebook"></a>
<span id="1001">Loading BokehJS ...</span>
</div>
source = retention.iloc[[0,1,2],:].T
p = figure(plot_width=800, plot_height=550,title="群组分析:用户留存")
p.line(x='CohortPeriod',y='2010-12',color="firebrick",line_width=2,source=source)
p.line(x='CohortPeriod',y='2011-01',color="navy",line_width=2,source=source)
p.line(x='CohortPeriod',y='2011-02',color="green",line_width=2,source=source)
p.xaxis.axis_label = '群组周期'
p.yaxis.axis_label = '% 群组购买'
p.xaxis.major_label_orientation=pi/4
p.y_range.start=0
show(p)
如忘记保存,或后续再查看,可凭“订单号” 手动获取