0. Import Package & Data Load¶

In [1]:

import pandas as pd
import seaborn as sns
import numpy as np
import os
import scipy.stats as st
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

data_cus = pd.read_pickle("data_cus.pkl")
data_cou = pd.read_pickle("data_cou.pkl")
data_ma = pd.read_pickle("data_ma.pkl")
data_on = pd.read_pickle("data_on.pkl")
data_tax = pd.read_pickle("data_tax.pkl")
data_merge = pd.read_pickle("data_merge.pkl") # simple analysis에서 작업한 data

1. Heuristic RFM(Recency, Frequency, Monetary)¶

이전 EDA에서 회원이 등록된 사용자는 모두 1회 이상 구매가 발생했던 것을 확인
데이터의 분포를 기반으로 가설을 세워, RFM 각 항목별 3단계 분류를 통해 고객 세분화를 진행한다.
- 현재 회원 수는 약 1500명으로 너무 많은 단계는 비효율 적이라 판단하였다.
- Moneytary의 지표로는 실제 회사의 이익에 반하는 결과를 얻기 위해 invoice_value를 사용

In [2]:

# 총 고객 수 
print("총 고객 수:", len(data_cus['CustomerID'].drop_duplicates()))

# recency table 처리
data_r = data_merge.copy()[['CustomerID', 'Transaction_Date']]
latest_purchase_date = data_r.groupby('CustomerID')['Transaction_Date'].max()
latest_purchase_week = latest_purchase_date.dt.strftime('%U').astype('int64') # week가 높을수록 최근 구매
df_r = pd.DataFrame({'CustomerID': latest_purchase_week.index, 'Recency': latest_purchase_week.values})

# frequency table 처리
data_f = data_merge.copy()[['CustomerID', 'Transaction_ID', 'Transaction_Date']]
# 같은 날짜에 동일 고객이 여러번 주문한 경우는 1개로 친다.
data_f = data_f.drop_duplicates(['CustomerID', 'Transaction_Date'], keep='first')
frequency = data_f.groupby('CustomerID')['Transaction_ID'].count()
df_f = pd.DataFrame({'CustomerID': frequency.index, "Frequency": frequency.values})

# moneytary table 처리
data_m = data_merge.copy()[['CustomerID', 'Transaction_ID', 'invoice_value']]
sum_invoice = data_m.groupby('CustomerID')['invoice_value'].sum().round()
df_m = pd.DataFrame({'CustomerID': sum_invoice.index, "Monetary": sum_invoice.values})

from functools import reduce
df_rfm = reduce(lambda x, y: pd.merge(x, y, on='CustomerID'), [df_r, df_f, df_m])

총 고객 수: 1468

In [3]:

# 데이터 분포 확인
df_rfm[['Recency', 'Frequency', 'Monetary']].hist(ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 3))[1])
plt.show()

# Monetary 누적 비율 그래프
sorted_temp = df_m.copy().sort_values(by='Monetary', ascending=False)

total_revenue = sorted_temp['Monetary'].sum()

sorted_temp['Cumulative_Sum'] = sorted_temp['Monetary'].cumsum()
sorted_temp['Cumulative_Percentage'] = (sorted_temp['Cumulative_Sum'] / total_revenue) * 100

fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.bar(np.arange(len(sorted_temp['CustomerID'])), sorted_temp['Monetary'], alpha=0.6)
ax1.set_xlabel('Customer')
ax1.set_ylabel('Sales', color='blue', fontsize=12)

ax2 = ax1.twinx()
ax2.plot(np.arange(len(sorted_temp['CustomerID'])), sorted_temp['Cumulative_Percentage'], color='red', linewidth=2)
ax2.set_ylabel('Percentage', color='red', fontsize=12)

plt.axvline(len(sorted_temp[sorted_temp['Cumulative_Percentage'] <= 60]), color='green', linewidth=2, linestyle='--')
plt.axvline(len(sorted_temp[sorted_temp['Cumulative_Percentage'] <= 80]), color='green', linewidth=2, linestyle='--')

plt.title("Monetary Cumulative_Percentage", fontsize=15)
plt.show()

No description has been provided for this image

Rececy의 분포는 약간 우편향이지만 비교적 고르게 분포되어있다.
- Recuncy는 quintile method를 이용하여 3단계 점수로 구분
Frequency의 분포는 매우 좌편향 되어있지만, 대부분 데이터가 5개 이하로 분포한다.
- frequency의 횟수가 1~3 번의 항목들에 각각 대응하는 점수를 부여하고, 3회이상은 3으로 치환한다.
Monetary의 분포는 매우 좌편향 되어있으며, 소수의 인원이 대부분의 매출을 담당한다.
- 해당 항목들은 파레토 법칙을 응용하여 데이터를 처리하여 3단계 점수로 구분.

In [4]:

# calculate recency score
def cal_recency(df_r):
    score = pd.cut(df_r, 3, labels=False)+1
    return score

# calculate frequency score
def cal_frequency(df_f):
    df_f.loc[df_f['Frequency'] > 3, 'Frequency'] = 3
    return df_f['Frequency']

# calculate monetary score
def cal_monetary(df_m):
    sorted_temp = df_m.copy().sort_values(by='Monetary', ascending=False)
    total_revenue = sorted_temp['Monetary'].sum()
    
    sorted_temp['Cumulative_Sum'] = sorted_temp['Monetary'].cumsum()
    sorted_temp['Cumulative_Percentage'] = (sorted_temp['Cumulative_Sum'] / total_revenue) * 100
    total_revenue = sorted_temp['Monetary'].sum()
    
    # 상위 60% 매출을 담당하는 고객군
    sorted_temp.loc[sorted_temp['Cumulative_Percentage'] <= 60, 'score'] = 3
    
    # 상위 80% 매출을 담당하는 고객군
    sorted_temp.loc[(sorted_temp['Cumulative_Percentage'] > 60) & (sorted_temp['Cumulative_Percentage'] <= 80), 'score'] = 2
    
    # Remaining customers
    sorted_temp = sorted_temp.fillna(1)
    
    return sorted_temp['score'].astype('int64')

In [5]:

df_rfm['R_score'] = cal_recency(df_rfm['Recency'])
df_rfm['F_score'] = cal_frequency(df_rfm)
df_rfm['M_score'] = cal_monetary(df_rfm)
df_rfm['RFM'] = df_rfm['R_score'].map(str) + df_rfm['F_score'].map(str) + df_rfm['M_score'].map(str)
df_rfm['RFM_score'] = np.round((df_rfm['R_score']+ df_rfm['F_score'] + df_rfm['M_score'])/3, 1)
df_rfm.head(5)

Out[5]:

	CustomerID	Recency	Frequency	Monetary	R_score	F_score	M_score	RFM	RFM_score
0	12346	37	1	175.0	3	1	1	311	1.7
1	12347	43	3	15687.0	3	3	3	333	3.0
2	12348	41	2	1690.0	3	2	1	321	2.0
3	12350	49	1	1467.0	3	1	1	311	1.7
4	12356	37	1	2007.0	3	1	1	311	1.7

In [6]:

# 각 항목별 점수 분포
fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 

for i, column in enumerate(['R_score', 'F_score', 'M_score']):
    counts = df_rfm[column].value_counts().sort_index()

    axes[i].pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    axes[i].set_title(column)  # 열 이름을 subplot 타이틀로 설정
 
plt.suptitle("Score distribution by item", fontsize=15)
plt.tight_layout()
plt.show()

plt.figure(figsize=(12,5))
df_rfm['RFM_score'].hist(bins=10, grid=False, figsize=(12,5), color='#86bf91', zorder=2, rwidth=.95)
plt.xticks(np.arange(0, 5, .5))
plt.title("Distribution of RFM mean score", fontsize=15)
plt.show()
#https://documentation.bloomreach.com/engagement/docs/rfm-segmentation
#https://dacon.io/competitions/official/236222/codeshare/9794
corr = df_rfm[['R_score', 'F_score', 'M_score']].corr()
sns.heatmap(corr, cmap='Blues')
plt.title("Feature Correlation", fontsize=15)
plt.show()

R에서는 1점의 빈도가 가장 낮고, F/M에서는 2, 3점의 분포는 비교적 비슷하다.
RFM의 점수 평균으로 봤을떄, 크게 4가지 분류로 나눠지는 것을 확인.
각 항목별 상관관계는 뚜렷하지 않아보인다.
해당 결과를 토대로 고객군을 분류하여 고객군별로 전략을 수립한다.

1-1. 고객 분류 방법¶

1) RFM mean score 분포에 따른 방식(고객에게 직접적으로 부여되는 등급의 의미)
    - VIP, Platinum, Gold, Silver 4 등급으로 분류
2) R, F, M 항목별 점수를 고려한 분류방식(마케팅 목적으로 내부에서 관리하는 등급의 의미)
    - 참고: https://documentation.bloomreach.com/engagement/docs/rfm-segmentation

In [7]:

"""
RFM mean score에 따른 방식(고객에게 직접적으로 부여되는 등급의 의미)
        - VIP: RFM mean score >= 2.7,
        - Platinum: 2.7 > RFM mean score > 2,
        - Gold: 2 >= RFM mean score >1.3
        - Silver: RFM mean score =< 1.3
"""

def assign_rfm_grade(score):
    if score >= 2.7:
        return 'VIP'
    elif 2.7 > score >= 2:
        return 'Platinum'
    elif 2 > score >= 1.7:
        return 'Gold'
    else:
        return 'Silver'

df_rfm['RFM_grade(score_mean)'] = df_rfm['RFM_score'].apply(assign_rfm_grade)

# RFM 등급별 고객 수 세기
grade_counts = df_rfm.groupby('RFM_grade(score_mean)')['CustomerID'].count()
total_customers = df_rfm['CustomerID'].count()
percentages = round((grade_counts / total_customers) * 100, 1).apply(lambda x: f"{x:.1f}%")
temp_df = pd.DataFrame({'Count': grade_counts, 'Percentage': percentages}).reindex(['VIP', 'Platinum', 'Gold', 'Silver'])
print(temp_df)

plt.figure(figsize=(11,3))
sns.boxplot(x='RFM_grade(score_mean)', y='Recency', data=df_rfm, order=['VIP', 'Platinum', 'Gold', 'Silver'])
plt.show()
plt.figure(figsize=(11,3))
sns.boxplot(x='RFM_grade(score_mean)', y='Frequency', data=df_rfm,  order=['VIP', 'Platinum', 'Gold', 'Silver'])
plt.show()
plt.figure(figsize=(11,3))
sns.boxplot(x='RFM_grade(score_mean)', y='Monetary', data=df_rfm,  order=['VIP', 'Platinum', 'Gold', 'Silver'])
plt.show()

                       Count Percentage
RFM_grade(score_mean)                  
VIP                      269      18.3%
Platinum                 403      27.5%
Gold                     323      22.0%
Silver                   473      32.2%

단순 RFM 평균 점수의 분포만으로 분류했을 경우, 상위등급(VIP, Platinum)의 비율이 크게 나오게 되어 상위등급을 위한 혜택에 대한 부담이 생길 수 있다.
각 등급별 R, F, M 항목의 분포에 대해서는 생각했던 목적과 유사하게 나타났다.

In [8]:

"""
R, F, M 항목별 점수를 고려한 분류방식(마케팅 목적으로 내부에서 관리하는 등급의 의미)
- VIP 고객: 각 항목별 3점이 최소 2개 이상 (333, 233, 323, 332)
- 충성고객: R, F 모두 2점 이상 (223, 222, 221, 232, 231, 322, 321, 331)
- 우수 고객: 평균 RFM 점수 2점 이상 (133, 123, 213, 132, 313, 312)
- 신규고객: (211, 311)
- 놓치면 안될 고객: (113, 122, 212)
- 이탈 우려 고객: (111, 112, 121, 131)
"""

def classify_customer(rfm):
    R = int(str(rfm)[0])  # R 값 추출
    F = int(str(rfm)[1])  # F 값 추출
    M = int(str(rfm)[2])  # M 값 추출

    if [R, F, M].count(3) == 3:  # VIP 고객
        return 'VIP 고객'
    elif R >= 2 and F >= 2 and M>=2:  # 충성고객
        return '충성 고객'
    elif (R + F + M)  / 3 >= 2:  # 우수고객
        return '우수 고객'
    elif R == 2 and F == 1 and M == 1:  # 신규고객
        return '신규 고객'
    elif R == 1 and (F == 1 or F == 2) and (M == 1 or M == 2):  # 놓치면 안될 고객
        return '놓치면 안될 고객'
    else:  # 이탈 우려 고객
        return '이탈 우려 고객'

# df_rfm['RFM']에 저장된 RFM 코드를 기반으로 고객 분류하기
df_rfm['RFM_grade(marketing)'] = df_rfm['RFM'].apply(classify_customer)
# RFM 등급별 고객 수 세기
grade_counts = df_rfm.groupby('RFM_grade(marketing)')['CustomerID'].count()
total_customers = df_rfm['CustomerID'].count()
percentages = round((grade_counts / total_customers) * 100, 1).apply(lambda x: f"{x:.1f}%")
temp_df = pd.DataFrame({'Count': grade_counts, 'Percentage': percentages}).reindex(['VIP 고객', '충성 고객', '우수 고객', '신규 고객', '놓치면 안될 고객', '이탈 우려 고객'])
temp_df

Out[8]:

	Count	Percentage
RFM_grade(marketing)
VIP 고객	134	9.1%
충성 고객	245	16.7%
우수 고객	293	20.0%
신규 고객	224	15.3%
놓치면 안될 고객	263	17.9%
이탈 우려 고객	309	21.0%

R, F, M 항목별 점수를 고려한 분류방식에서 역시 상위등급(VIP, 충성고객)의 비율이 높게 나왔다.
- R, F, M 항목의 등급을 1~5 등으로 세분화하는 전략이 필요하다.
VIP 고객, 충성 고객: 우선 배송, 전담 매니저 등의 혜택 제공 및 cross-selling 제품에 대한 구매 유도
우수 고객: 맞춤형 상품 추천, 고객 구매 상품에 대한 추가정보 제공
신규 고객: 첫 구매 후 추가 할인 쿠폰 제공, 커뮤니케이션 등을 통한 재구매 유도
놓치면 안될 고객: 재구매 시기 예측하여 프로모션 제공
이탈 우려 고객: 복귀 회원에 대한 혜택 제공, push notificaiton 마케팅

2. K-means Clustering¶

RFM에서 사용한 항목들을 지표로 사용하여 군집의 특징 파악

In [9]:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
from yellowbrick.cluster import KElbowVisualizer

df_X = reduce(lambda x, y: pd.merge(x, y, on='CustomerID'), [df_r, df_f, df_m])

df_X_temp = df_X.copy()
df_X_temp['Recency'] = np.sqrt(df_X_temp['Recency']) # 분포를 고려한 root transform
df_X_temp.loc[df_X_temp['Frequency'] > 5, 'Frequency'] = 5 # 드물게 분포된 매우 높은 값을 특정값으로 치환
df_X_temp['Monetary'] = np.log(df_X_temp['Monetary']) # 분포를 고려한 log transform

df_X_temp = df_X_temp.drop(columns=['CustomerID'])
df_X_temp.hist(ax = plt.subplots(nrows=1, ncols=3, figsize=(10, 3))[1])
plt.suptitle("Score distribution by item", fontsize=15)
plt.tight_layout()
plt.show()

data_X = df_X_temp.to_numpy()
print("X_data shape:", data_X.shape)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(data_X)

X_data shape: (1468, 3)

In [10]:

# 최적의 Cluster 개수를 구하기 위한 Elbow method
k_values = range(1, 11)
inertia_values = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X_scaled)
    inertia_values.append(kmeans.inertia_)

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(X_scaled)

Out[10]:

KElbowVisualizer(ax=<Axes: >, estimator=KMeans(n_clusters=9), k=(1, 10))

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [11]:

# Elbow method를 통해 나온 4개의 군집 생성을 위한 k means model 선언
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(X_scaled)

df_X['cluster'] = kmeans.labels_

In [12]:

# 각 군집별 항목 등급 분포
plt.figure(figsize=(11,3))
sns.boxplot(x='cluster', y='Recency', data=df_X)
plt.show()
plt.figure(figsize=(11,3))
sns.boxplot(x='cluster', y='Frequency', data=df_X)
plt.show()
plt.figure(figsize=(11,3))
sns.boxplot(x='cluster', y='Monetary', data=df_X)
plt.show()
df_X.groupby('cluster')['CustomerID'].count()

Out[12]:

cluster
0    324
1    649
2    284
3    211
Name: CustomerID, dtype: int64

각 cluster별 특징 분석
- cluster 0: R-상위, (F, M)-하위 군집으로 신규고객들의 군집으로 판단.
- clsuter 1: R, F, M 모두 다른 군집에 비해 상위 군집으로 VIP, 충성, 우수 고객 군집으로 판단.
- cluster 2: F, M은 하위 군집이나 R은 고루 분포되어있으므로 일반적인 고객 군집으로 판단.
- cluster 3: R, F는 하위 군집이나 M은 다른 군집에 비해 상위 집단으로, 이탈우려 고객 및 놓쳐서는 안될 고객 군집으로 판단.
해당 K-means 결과 역시 상위 집단의 비율이 높게 나타났다.
- 이는 R, F, M별 각 항목별 등급 세분화 및 새로운 지표 추가를 통한 피라미드식 등급 분포를 생성하는 것이 추후에 전략적인 선택을 하는데 도움이 될 것이다.
Heuristic RFM vs K-means Clustering
- 두 분석의 집단 분류결과는 비슷하게 나왔다고 판단됨.
- 데이터 기반의 Clustering 결과의 해석의 용이함이 확인 되었으므로, Clustering에 의한 결과를 이용하는 것이 모델의 지속가능한 사용을 위해 좋을 것이라 판단됨.

In [13]:

# K-means cluster 3d visualization
from mpl_toolkits.mplot3d import Axes3D

plt.figure(figsize=(6, 6))
fig = plt.figure(1)
plt.clf()

ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_X_temp['Frequency'], df_X_temp['Recency'], df_X_temp['Monetary'],
           c=df_X['cluster'],
           s=200,
           cmap='spring',
           alpha=0.5,
           edgecolor='darkgrey')

ax.set_xlabel('Frequency', fontsize=13)
ax.set_ylabel('Recency', fontsize=13)
ax.set_zlabel('Monetary', fontsize=13)
ax.view_init(elev=50, azim=60)
plt.tight_layout()

plt.show()

3. 한계점 및 추후 분석¶

고객 평가 등급을 더욱 세분화하거나, 가중치, 새로운 지표 등을 추가해야 될 것이라 판단됨.
RFM, Clustering 분석의 경우 과거 데이터에 기반한 분석으로, 고객의 동적인 특성을 반영하지 못할 수 있다.
고정된 기준에 의한 판단으로 bias 를 가질 수 있다.
추후, 고객의 행동을 예측하는 분석, cross-selling 분석, 코호트 분석등을 통한 개인화 마케팅 전략 수립 및 행동에 대한 예측 분석 필요.

[LTV] Marketing insights for E-commerce company (0)	2024.06.21
[Cohort Analysis] Marketing insights for E-commerce company (0)	2024.06.15
[Simple Analysis] Marketing insights for E-commerce company (1)	2024.06.03
[EDA] Marketing insights for E-commerce company (0)	2024.06.03

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

Data 공부

Data 공부

[Customer Segment] Marketing insights for E-commerce company 본문

[Customer Segment] Marketing insights for E-commerce company

Customer Segment

Heuristic RFM 기법과 data-driven K means clustering 기법을 비교하여 고객 세분화 결과를 비교한다.

0. Import Package & Data Load¶

1. Heuristic RFM(Recency, Frequency, Monetary)¶

1-1. 고객 분류 방법¶

2. K-means Clustering¶

3. 한계점 및 추후 분석¶

'Data 분석 > E-Commerce data' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

« 2025/08 »
일	월	화	수	목	금	토
					1	2
3	4	5	6	7	8	9
10	11	12	13	14	15	16
17	18	19	20	21	22	23
24	25	26	27	28	29	30
31