时间:2023-03-15 23:58:01 | 来源:电子商务
时间:2023-03-15 23:58:01 来源:电子商务
# 导库import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport seaborn as sns%matplotlib inlinefrom pyecharts.charts import *from pyecharts import options as opts# plt显示中文plt.rcParams['font.sans-serif'] = ['SimHei'] #显示中文标签plt.rcParams['axes.unicode_minus'] = False #显示符号import warningswarnings.filterwarnings('ignore')
# --- Data import ---
# The file on disk is a CSV: the original pd.read_excel call failed with
# "XLRDError: Expected BOF record; found b'Row ID,O'" — that byte prefix is the
# CSV header text, not an Excel BOF record.  Read it as CSV instead of
# renaming the file to .xlsx.
data0 = pd.read_csv(r'E:/jupyter data/superstore_dataset2011-2015.csv')
data0.head()

# Work on a copy and assign readable Chinese column names.
data1 = data0.copy()
data1.columns = [
    '行编号', '订单ID', '订单日期', '发货时间', '发货模式', '客户ID', '客户姓名',
    '客户类别', '客户所在城市', '客户城市所在州', '客户所在国家', '邮编',
    'marker所属区域', 'marker所属洲', '产品ID', '产品类别', '产品子类别',
    '产品名称', '销售额', '销售量', '折扣', '利润', '发货成本', '订单优先级',
    '不知名24', '不知名25']
# --- 2) Missing values ---
# Missing-rate per column as percentages, highest first.
(data1.isnull().sum().sort_values(ascending=False) / data1.shape[0]).apply(
    lambda x: format(x, '.2%'))

# The two unnamed trailing columns have unknown meaning and are almost entirely
# missing -> drop them.  '邮编' (zip) is ~79% missing but unused, so it stays.
data2 = data1.iloc[:, :-2]

# --- 3) Type conversion ---
data2['订单日期'] = pd.to_datetime(data2['订单日期'])       # parse order dates
data2 = data2.sort_values(by='订单日期', ascending=True)    # chronological order
# Vectorized .dt accessor instead of the original per-row map(lambda x: x.month)
# — identical values, far faster on a frame this size.
data2['订单日期_月'] = data2['订单日期'].dt.month
data2['订单日期_年'] = data2['订单日期'].dt.year
data2['发货时间'] = pd.to_datetime(data2['发货时间'])       # parse ship dates
data2.reset_index(drop=True, inplace=True)                  # clean index after sort
# --- Clean non-numeric garbage in 销售额 / 利润 ---
# A bare astype(float) raises (ValueError: could not convert string to float:
# ' Metal"') because some rows contain shifted/corrupt text.  pd.to_numeric
# with errors='coerce' turns every unparseable value into NaN in one
# vectorized pass — more robust than the original `type(x) is float` row
# filter (which also threw away numeric values stored as strings) and much
# faster than two Python-level apply scans.
data2['销售额'] = pd.to_numeric(data2['销售额'], errors='coerce')
data2['利润'] = pd.to_numeric(data2['利润'], errors='coerce')
bad = data2['销售额'].isna() | data2['利润'].isna()
bad.sum()  # corrupt rows to drop (original found 1555 / 1438 per column)
data2 = data2[~bad]
data2.reset_index(drop=True, inplace=True)
data2['销售量'] = data2['销售量'].astype('int')

# --- 4) Derived metrics ---
data2['销售单价'] = data2['销售额'] / data2['销售量']   # unit price
# ISO weekday (Mon=1 … Sun=7); dt.dayofweek is 0-based, hence the +1 —
# equivalent to the original apply(lambda x: x.isoweekday()) but vectorized.
data2['星期'] = data2['订单日期'].dt.dayofweek + 1

data2.describe()
# Article narration: most rows are low-priced, but 25% of items have negative
# profit — possibly clearance discounting to reduce stock.

# Yearly totals of sales / profit / volume; margin = profit / sales.
data_sale = data2.groupby('订单日期_年').sum()[['销售额', '利润', '销售量']]
data_sale['利润率'] = data_sale['利润'] / data_sale['销售额']
data_sale
riqi_nian = list(data_sale.index.astype(str))       # pyecharts wants str x labels
xiaoshoue = list(data_sale['销售额'].astype(int))   # cast to int to avoid decimals
lirun = list(data_sale['利润'].astype(int))
lirunlv = list(round(data_sale['利润率']*100, 2))
rate = data_sale.pct_change()  # year-over-year growth
rate
# Visualization: bar (sales/profit) + overlaid line (margin), plus a second
# grid of YoY growth lines underneath.
bar1 = (Bar()
        .add_xaxis(riqi_nian)
        .add_yaxis('销售额', xiaoshoue, xaxis_index=1, yaxis_index=1)
        .add_yaxis('利润', lirun, xaxis_index=1, yaxis_index=1)
        .extend_axis(yaxis=opts.AxisOpts(
            name="百分比",
            type_="value",
            min_=6,  # NOTE(review): hard-coded axis floor — confirm intentional
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(color="#d14a61")),  # right-axis color
            axislabel_opts=opts.LabelOpts(formatter="{value} %")))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="2011年-2014年整体盈利情况"),
            yaxis_opts=opts.AxisOpts(
                axislabel_opts=opts.LabelOpts(formatter="{value} 元")),  # left axis
            toolbox_opts=opts.ToolboxOpts(is_show=True))  # show the toolbox
        )
line1 = (Line()
         .add_xaxis(riqi_nian)
         .add_yaxis("利润率", lirunlv, xaxis_index=1, yaxis_index=2)
         .set_global_opts(yaxis_opts=opts.AxisOpts(type_="value",
                                                   min_=6,
                                                   name='百分比'))
         )
overlap1 = bar1.overlap(line1)
# YoY growth lines; first year has no prior period, hence the [1:] slices.
line2 = (Line()
         .add_xaxis(riqi_nian[1:])
         .add_yaxis("销售额", list(round(rate['销售额']*100, 2))[1:], xaxis_index=0, yaxis_index=0)
         .add_yaxis("利润", list(round(rate['利润']*100, 2))[1:], xaxis_index=0, yaxis_index=0)
         .add_yaxis("销售量", list(round(rate['销售量']*100, 2))[1:], xaxis_index=0, yaxis_index=0)
         .add_yaxis("利润率", list(round(rate['利润率']*100, 2))[1:], xaxis_index=0, yaxis_index=0)
         .set_global_opts(title_opts=opts.TitleOpts(title="2012年-2014年各项同比增长率", pos_top="48%"),
                          legend_opts=opts.LegendOpts(pos_top="48%"),
                          yaxis_opts=opts.AxisOpts(
                              axislabel_opts=opts.LabelOpts(formatter="{value} %")))
         )
grid = (Grid(init_opts=opts.InitOpts(width="1000px", height="700px"))
        .add(line2, grid_opts=opts.GridOpts(pos_top="60%"), is_control_axis_index=True)
        .add(overlap1, grid_opts=opts.GridOpts(pos_bottom="60%"), is_control_axis_index=True)
        )
# line2 is added first: drawing appears to start from the bottom grid, so
# line2's x/y axes are created before overlap1's.
grid.render_notebook()
# Article narration: sales and profit grew steadily over the four years, though
# far below large retailers such as Walmart.

# Monthly totals per year for sales / profit / volume.
data_sale_month = pd.pivot_table(data2,
                                 index='订单日期_月',
                                 columns='订单日期_年',
                                 values=['销售额', '利润', '销售量'],
                                 aggfunc='sum')
# BUG FIX: profit margin is 利润 / 销售额 (profit over sales amount).  The
# original divided by 销售量 (units sold), which is profit per unit, not a
# rate — the "利润率" subplot was mislabeled.
data_sale_month_lirunlv = data_sale_month['利润'] / data_sale_month['销售额']
data_sale_month
fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=100)
data_sale_month['销售额'].plot.area(colormap='Accent_r', stacked=False, ax=axes[0, 0], title='销售额')
data_sale_month['利润'].plot.area(colormap='Accent_r', stacked=False, ax=axes[0, 1], title='利润')
data_sale_month['销售量'].plot.area(colormap='Accent_r', stacked=False, ax=axes[1, 0], title='销售量')
data_sale_month_lirunlv.plot.line(ax=axes[1, 1], title='利润率')
plt.subplots_adjust(wspace=0.3, hspace=0.3)  # widen gaps between subplots

# Seasonality is clear: slow first half of the year, busy second half.
# Month-over-same-month (YoY) sales growth, shaded green.
data_sale_month['销售额'].pct_change(axis='columns').iloc[:, 1:].style.background_gradient(
    cmap='Greens', low=0, high=1)
# Article narration: every month grows YoY but with no obvious pattern in the
# growth rate, which complicates splitting next year's targets.

# Total units sold per ISO weekday (1=Mon … 7=Sun).
week_xiaoshou = data2.groupby('星期')['销售量'].sum()
# BUG FIX: the original bound the x-axis list to `week_xiaoshoux` but then
# passed an undefined name `x` to add_xaxis — a NameError at runtime.
x = week_xiaoshou.index.tolist()
y = week_xiaoshou.values.tolist()
bar = (Bar()
       .add_xaxis(x)
       .add_yaxis("销售额", y)  # NOTE(review): series is 销售量 (units) though labeled 销售额
       .set_global_opts(title_opts=opts.TitleOpts(title="工作日销售额明显高于节假日"))
       )
bar.render_notebook()

# Discount loss = discount rate * sales amount; rows without a discount get 0
# via the fillna (they are NaN after the filtered multiplication).
data2['打折损失'] = data2[data2['折扣'] != 0.0]['折扣'] * data2[data2['折扣'] != 0.0]['销售额']
data2['打折损失'] = data2['打折损失'].fillna(0)
data2.groupby('订单日期_年')['打折损失'].sum()
# --- 5.2 Product analysis: why does the Tables sub-category lose money? ---
table = data2[data2['产品子类别']=='Tables']
other = data2[data2['产品子类别']!='Tables']
# Discount frequency: share of discounted rows, Tables vs everything else.
table[table['折扣']!=0]['折扣'].count() / table['折扣'].count(), other[other['折扣']!=0]['折扣'].count() / other['折扣'].count()
# -> (0.762, 0.445): Tables are discounted far more often.
# Discount depth: mean discount among discounted rows.
table[table['折扣']!=0]['折扣'].mean(), other[other['折扣']!=0]['折扣'].mean()
# -> (0.387, 0.327).  Shipping-cost comparison:
table['发货成本'].mean(), other['发货成本'].mean()
# -> (92.5, 25.2).  Share of Tables rows with negative profit:
sum(table['利润']<0) / table['利润'].count()
# -> 0.589.
# What-if: assume Tables were discounted at the other categories' rate.
# The 0.76 / 0.44 constants are the ratios computed above, hard-coded.
a1 = int(table['折扣'].count() * (0.76-0.44)) # rows that "should not" be discounted
a2 = table[table['折扣']!=0]['折扣'].count() - a1 # rows that should stay discounted
# NOTE(review): .sample() is random and unseeded — the result differs per run.
table_ = table[table['折扣']!=0].sample(a2)
loss0 = (table[table['折扣']!=0]['折扣']*table[table['折扣']!=0]['销售额']).sum()
loss1 = (table_['销售额']*table_['折扣']).sum()
loss2 = loss0-loss1 # loss recovered if the discount RATE matched other categories
# What-if: Tables' discount DEPTH equal to other categories.
loss3 = (table_['折扣'].mean()-other[other['折扣']!=0]['折扣'].mean())*table_['销售额'].sum()
# What-if: shipping cost equal to other categories (~67 yuan cheaper per row).
loss4 = table['利润'].count()*67
# Hypothetical profit after recovering all three losses.
new_table_lirun = table['利润'].sum()+loss2+loss3+loss4
new_table_lirun
# Article narration: the hypothetical profit sits at the category median, so
# Tables' losses come from promotion intensity (+7% rate, +6% depth) and
# shipping cost (+67 yuan); likely clearance stock, or a competitive-pressure
# problem; warehouse placement could cut shipping cost for bulky items.

# --- Per-category sales, YoY ---
sale_leibie = pd.pivot_table(data2, index=['产品类别','产品子类别'], columns='订单日期_年', values=['销售额'], aggfunc='sum')
sale_leibie
sale_leibie.pct_change(axis='columns').style.background_gradient(cmap='Greens',low=0,high=1)

# --- Market-basket matrix: orders x products, quantity, 0-filled ---
basket = data2.pivot_table(index='订单ID', columns='产品ID', values='销售量', aggfunc=np.sum).fillna(0)
basket
# NOTE(review): the article reports an unexplained error here — plausibly
# memory: (#orders x #products) is enormous once densified by fillna(0).
# Confirm before relying on this cell.

# --- Price band: min/max unit price per detailed category ---
data2['产品详细类别'] = data2['产品类别'] +','+data2['产品子类别']
price_max = data2.groupby('产品详细类别')[['销售单价']].max().round(2)
price_min = data2.groupby('产品详细类别')[['销售单价']].min().round(2)
price_max.rename(columns={'销售单价':'最高价'}, inplace=True)
price_min.rename(columns={'销售单价':'最低价'}, inplace=True)
price_width = price_min.join(price_max)
# Human-readable "(min, max)" band plus its numeric width.
price_width['价格带(元)'] = '('+price_width['最低价'].astype(str)+', '+price_width['最高价'].astype(str)+')'
price_width['价格带宽度(元)'] = price_width['最高价'] - price_width['最低价']
price_width
# Price-band depth: number of distinct SKUs (product IDs) per category.
def drop_duplicates(group):
    """Count distinct non-null values in *group* (groupby aggfunc).

    Kept because a later cell reuses it; for plain column counts the built-in
    .nunique() below is equivalent and avoids the Python-level round trip.
    """
    return group.drop_duplicates().count()

# .nunique() == drop_duplicates().count() for non-null data, but runs in C.
place_deep = data2.groupby('产品详细类别')[['产品ID']].nunique()
place_deep.rename(columns={'产品ID': '价格带深度(个)'}, inplace=True)
place_deep

# Price-band breadth: number of distinct unit prices per category.
place_range = data2.groupby('产品详细类别')[['销售单价']].nunique()
place_range.rename(columns={'销售单价': '价格带广度(个)'}, inplace=True)
place_range
place_analysis = (price_width[['价格带(元)']]
                  .join(price_width[['价格带宽度(元)']])
                  .join(place_deep[['价格带深度(个)']])
                  .join(place_range[['价格带广度(个)']]))
place_analysis

# Distribution of sales volume over unit price, per detailed category.
# (Article note: sub-categories are broad, so band widths are large and this
# analysis is only approximate.)
leibie = data2['产品详细类别'].unique().tolist()
fig = plt.figure(figsize=(18, 18))
for i, cat in enumerate(leibie):  # enumerate instead of range(len(...)) re-derivation
    df1 = data2[data2['产品详细类别'] == cat].groupby('销售单价')[['销售量']].sum()
    sns.set(style='whitegrid')
    plt.subplot(5, 4, i + 1)
    sns.distplot(df1['销售量'], bins=10)  # NOTE: distplot is deprecated in seaborn>=0.11
    plt.xlabel(' ')
    plt.title(str(cat))
大多数品种价格以偏态分布为主,即价格越低,销量越高。小部分品种(如fasteners纽扣)会呈现对称分布,还有一些会出现多峰情况,企业要根据品种的价格分布合理生产,避免库存积压。# datetime64格式使用.days提取日期data2['发货速度'] = (data2['发货时间']-data2['订单日期']).apply(lambda x:x.days)data2.groupby('产品详细类别')['发货速度'].mean()
# (Article note: the 订单日期/发货时间 ordering is unclear without business
# context, so shipping is not analyzed further.)

# --- Customer-type analysis: sales amount and headcount per year ---
customer_sale = data2.pivot_table('销售额', index='客户类别', columns='订单日期_年', aggfunc='sum')
customer_sale  # sales amount per type/year
customer_num = data2.pivot_table('客户ID', index='客户类别', columns='订单日期_年', aggfunc='count')
customer_num  # row counts per type/year
customer_num = customer_num.astype(float)  # pyecharts chokes on numpy ints; floats work

from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType


def _row_share(df, row):
    """One {'value', 'percent'} dict per column for row *row* of *df*.

    'percent' is the row value's share of that column's total — what the
    JsCode label formatter below displays.  Replaces six copy-pasted loops.
    """
    return [{"value": df.iloc[row, i],
             "percent": df.iloc[row, i] / df.iloc[:, i].sum()}
            for i in range(df.shape[1])]


def _stacked_share_bar(df):
    """Stacked bar of the three customer types (rows 0-2) with % labels."""
    return (Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
            .add_xaxis([2011, 2012, 2013, 2014])
            .add_yaxis("Consumer", _row_share(df, 0), stack='stack1', category_gap="50%")
            .add_yaxis("Corporate", _row_share(df, 1), stack='stack1', category_gap="50%")
            .add_yaxis("Home Office", _row_share(df, 2), stack='stack1', category_gap="50%")
            .set_series_opts(label_opts=opts.LabelOpts(
                position="right",
                formatter=JsCode("function(x){return Number(x.data.percent * 100).toFixed() + '%';}")))
            )


bar3 = _stacked_share_bar(customer_sale)  # sales-amount composition
bar3.render_notebook()
bar2 = _stacked_share_bar(customer_num)   # headcount composition
bar2.render_notebook()
每类消费者的销售额、消费人数都稳步增长,但Consumer体量最大,增长相对较慢,可能有点饱和,而Home office体量小增长快,似乎还有很大的市场,可以根据精准营销拉新。customer_month = pd.pivot_table(data2 ,index='订单日期_月' ,columns='订单日期_年' ,values=['销售额', '订单ID', '客户ID', '销售量'] ,aggfunc = {'销售额':'sum' # 消费额 ,'订单ID':drop_duplicates # 订单数 ,'客户ID':drop_duplicates # 消费人数 ,'销售量':'sum'}) # 销售总量customer_month.rename(columns = {'销售额':'消费额' ,'订单ID':'订单数' ,'客户ID':'消费人数'},inplace = True) customer_monthfig,axes = plt.subplots(2,2,figsize = (12,8),dpi=100)# plt显示中文plt.rcParams['font.sans-serif'] = ['SimHei'] #显示中文标签plt.rcParams['axes.unicode_minus'] = False #显示符号customer_month['消费人数'].plot.area(colormap = 'Accent_r', stacked=False,ax=axes[0,0],title='消费人数')customer_month['订单数'].plot.area(colormap = 'Accent_r', stacked=False,ax=axes[0,1],title='消费次数')(customer_month['消费额'] / customer_month['订单数']).plot.line(colormap = 'Accent_r', stacked=False,ax=axes[1,0],title='单次消费金额')(customer_month['销售量'] / customer_month['订单数']).plot.line(colormap = 'Accent_r', stacked=False,ax=axes[1,1],title='连带率')plt.subplots_adjust(wspace=0.3, hspace=0.3) # wspace,hspace 把间距变大
# Article narration: roughly Jun-Nov is peak season; dips around Jun-Jul,
# Sep-Oct, Dec-Jan; no clear pattern in basket value or attach rate.

# Month x customer purchase matrix: count of rows per customer per month,
# then binarized (1 = bought that month).
data2['订单日期_年月'] = data2['订单日期_年'].astype(str) + '-' + data2['订单日期_月'].astype(str)
data2['订单日期_年月'] = pd.to_datetime(data2['订单日期_年月'])
month_buy_ = data2.pivot_table(index='客户ID', columns='订单日期_年月',
                               values='行编号', aggfunc='count').fillna(0)
month_buy = month_buy_.applymap(lambda x: 1 if x > 0 else 0)
month_buy.head()


def active_status(data):
    """Label one customer's state for every month.

    States: 'unreg' (not yet seen), 'new' (first purchase), 'active'
    (purchased after an active/new month), 'unactive' (lapsed), 'return'
    (purchased again after lapsing).  Returns a Series indexed by month.
    """
    status = []
    for i in range(month_buy.shape[1]):
        # FIX: .iloc for positional access — bare data[i] is a label lookup,
        # deprecated for positional use on newer pandas.
        if data.iloc[i] == 0:               # no purchase this month
            if status:                      # any month after the first
                status.append('unreg' if status[i - 1] == 'unreg' else 'unactive')
            else:                           # first month, never purchased
                status.append('unreg')
        else:                               # purchased this month
            if not status:                  # first month
                status.append('new')
            elif status[i - 1] == 'unactive':
                status.append('return')     # came back after lapsing
            elif status[i - 1] == 'unreg':
                status.append('new')
            else:
                status.append('active')
    return pd.Series(status, index=month_buy.columns)


customer_status = month_buy.apply(active_status, axis=1)
customer_status.head()
# FIX: Series.value_counts() instead of pd.value_counts(x) — the top-level
# alias is deprecated and removed in pandas 2.x.
customer_status_num = customer_status.apply(lambda x: x.value_counts()).fillna(0).T
# Re-index by 'YYYY-MM' strings (strip the '-DD' day suffix).
customer_status_num.index = pd.DataFrame(customer_status_num.index)['订单日期_年月'].astype(str).apply(lambda x: x[:-3]).tolist()
customer_status_num
x_data = customer_status_num.index.tolist()
y_data1 = customer_status_num['active'].values.tolist()
y_data2 = customer_status_num['new'].values.tolist()
y_data3 = customer_status_num['return'].values.tolist()
y_data4 = customer_status_num['unactive'].values.tolist()
y_data5 = customer_status_num['unreg'].values.tolist()
line = (Line()
        .add_xaxis(x_data)
        .add_yaxis('active', y_data1, label_opts=opts.LabelOpts(is_show=False))
        .add_yaxis('new', y_data2, label_opts=opts.LabelOpts(is_show=False))
        .add_yaxis('return', y_data3, label_opts=opts.LabelOpts(is_show=False))
        .add_yaxis('unactive', y_data4, label_opts=opts.LabelOpts(is_show=False))
        .add_yaxis('unreg', y_data5, label_opts=opts.LabelOpts(is_show=False))
        )
line.render_notebook()
# Article narration: active users grow steadily (retention work paying off)
# but are dwarfed by inactive ones; returning users are flat; new customers
# per month dwindle to single digits after 2013 — retention/win-back matters.

# --- One-time buyers ---
# A customer whose earliest and latest order dates coincide only ever bought
# on a single day.
first_last = data2.groupby('客户ID')['订单日期'].agg(['min', 'max']).reset_index()
one_day_counts = (first_last['min'] == first_last['max']).value_counts().values
(first_last['min'] == first_last['max']).value_counts()

# --- Repurchase rate ---
# Definition: users buying 2+ times within the window (one calendar month)
# over all buying users; two same-day orders count as a repurchase.
# Encode each cell: >=2 purchases -> 1, exactly 1 -> 0, none -> NaN.
transf = month_buy_.applymap(lambda c: 1 if c > 1 else np.nan if c == 0 else 0)
transf.head()
# count() = buyers that month (non-NaN cells); sum() = those with 2+ buys.
df_duplicate = pd.DataFrame(transf.sum() / transf.count()).reset_index()
df_duplicate.columns = ['Date', 'DuplicatedRate']
df_duplicate['Date'] = df_duplicate['Date'].astype(str).apply(lambda s: s[:-3])
df_duplicate
x_data = df_duplicate['Date'].tolist()
y_data = df_duplicate['DuplicatedRate'].tolist()
line = (Line()
        .add_xaxis(x_data)
        .add_yaxis('', y_data, label_opts=opts.LabelOpts(is_show=False))
        )
line.render_notebook()
# --- Inter-purchase interval per customer ---
# shift() aligns each order date with its predecessor, so the subtraction
# yields the gap since the previous purchase (each customer's first order
# becomes NaT and is dropped below).
buy_diff = data2.groupby('客户ID').apply(lambda g: g['订单日期'] - g['订单日期'].shift())
buy_diff
gap_days = (buy_diff / np.timedelta64(1, 'D')).dropna()
sns.distplot(gap_days[(gap_days.values > 0) & (gap_days.values < 500)], bins=30, kde=True)

# --- Customer lifetime (assuming "today" is 2014-12-31) ---
# Lifetime = span between a customer's first and last purchase, in days.
grouped_dates = data2.groupby('客户ID')['订单日期']
buy_life = (grouped_dates.max() - grouped_dates.min()).reset_index()
buy_life['life_time'] = buy_life['订单日期'] / np.timedelta64(1, 'D')
buy_life
buy_life['life_time'].plot.hist(bins=15, title='客户生命周期分布')
# Article narration: most customers have a ~3-year lifetime — fairly healthy.

# --- Bargain-focused customers ---
# A discount above 0.5 marks a "special price" purchase.
data2['是否特价'] = data2['折扣'].apply(lambda d: 1 if d > 0.5 else 0)
per_customer = data2.groupby('客户ID')['是否特价']
# Share of each customer's purchases that were special-price.
customer_tejia = pd.DataFrame(per_customer.sum() / per_customer.count())
# Customers who buy at special prices more than half the time.
customer_tejia_ = customer_tejia[customer_tejia['是否特价'] > 0.5]
customer_tejia_['是否特价'].sort_values(ascending=False)

# --- High unit-price customers: top 10 by mean unit price ---
pd.DataFrame(data2.groupby('客户ID')['销售单价'].mean().sort_values(ascending=False)).iloc[:10, :]
rfm = pd.pivot_table(data2 ,index='客户ID' ,values=['销售额', '订单日期','订单ID'] ,aggfunc = {'销售额':'sum', # money '订单日期':'max', # 最近一次消费时间 '订单ID':'count'}) # frequencyrfm.rename(columns = {'销售额':'M', '订单ID':'F'},inplace = True) rfm.head()# 假设今天为2014-12-31rfm['R'] = (rfm['订单日期'].max() - rfm['订单日期'])/ np.timedelta64(1, 'D')# rfm['订单日期']里面的数据类型是timedflta时间,无法直接作出直方图,先换算成数值,换算的方式直接除timedelta函数即可,np.timedelta64(1, ‘D’),D表示天,1表示1天,作为单位使用的。rfm.head()sns.pairplot(rfm[['F','M','R']])rfm_std = rfm[['R']].apply(lambda x: -(x-x.mean()) / x.std()) # 先转化为极大值,再标准化rfm_std[['F','M']] = rfm[['F','M']].apply(lambda x: (x-x.mean()) / x.std())rfm_std# 构建rfm模型公式,用平均值进行划分def get_rfm(x): """ R: 用户最近一次消费的间隔, 1为表现好 F: 用户最近一段时间的消费次数, 1为表现好 M: 用户最近一段时间的交易金额, 1为表现好 """ level = x.apply(lambda x:'1' if x>=0 else '0') label = level['R'] + level['F'] + level['M'] dict = {'111':'重要价值客户', '011':'重要保持客户', '101':'重要挽留客户', '001':'重要发展客户', '110':'一般价值客户', '010':'一般保持客户', '100':'一般挽留客户', '000':'一般发展客户'} result = dict[label] return resultrfm['label'] = rfm_std.apply(get_rfm, axis=1)rfm.head()rfm_groupby = rfm.groupby('label')['M'].sum().sort_values()bar = ( Bar() .add_xaxis(rfm_groupby.index.tolist()) .add_yaxis("", np.around(rfm_groupby,0).tolist()) .reversal_axis() .set_series_opts(label_opts=opts.LabelOpts(position="right")) .set_global_opts(title_opts=opts.TitleOpts(title="各类用户销售额贡献值")))bar.render_notebook()
重要价值客户贡献了大部分的销售额,将来运营可以对不同的客户类型进行相应营销策略,特别要注重重要价值客户的留存# 可以进行打分quantiles = rfm.quantile(q=[0.25,0.5,0.75]).to_dict()quantiles# 我们最想要的客户是:粘性高,忠诚度和收入高的用户# 一般分成 3~5 段,这里我们分为4段def RScore(x,p,d): if x <= d[p][0.25]: return 1 elif x <= d[p][0.50]: return 2 elif x <= d[p][0.75]: return 3 else: return 4 def FMScore(x,p,d): if x <= d[p][0.25]: return 4 elif x <= d[p][0.50]: return 3 elif x <= d[p][0.75]: return 2 else: return 1rfm['r_quartile'] = rfm['R'].apply(RScore, args=('R',quantiles))rfm['f_quartile'] = rfm['F'].apply(FMScore, args=('F',quantiles))rfm['m_quartile'] = rfm['M'].apply(FMScore, args=('M',quantiles))#RFM score = r*100+f*10+m, 这里权重随便设置的,111 为rfm score的最高分rfm['RFMScore'] = rfm.r_quartile*100 + rfm.f_quartile*10 +rfm.m_quartilerfm.head(5)# 划分等级def rfm_level(RFMScore): if (RFMScore >= 0 and RFMScore < 122): return '1' elif (RFMScore >= 122 and RFMScore < 223): return '2' elif (RFMScore >= 223 and RFMScore < 333): return '3' return '4'rfm['RFMScore_level'] = rfm['RFMScore'].apply(rfm_level)rfm.head()# 不同rfm 分数的分布import seaborn as snscolor = sns.color_palette()from pylab import mplmpl.rcParams['font.sans-serif'] = ['SimHei'] plt.figure(figsize=(6,8))sns.countplot(x='RFMScore_level',data=rfm, color = color[9])plt.ylabel('计数',fontsize=12)plt.xlabel('RFM得分水平', fontsize=12)plt.xticks(rotation='vertical')plt.title('RFM得分水平分布',fontsize=15)plt.show()
关键词:分析,数据,零售