0. Import Package¶

In [1]:

import pandas as pd
import seaborn as sns
import numpy as np
import os
import scipy.stats as st
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

1. Data Load¶

In [2]:

# Load the five pre-processed tables (customers, coupons, marketing spend,
# online transactions, tax rates) from pickle files in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source (presumably the earlier EDA notebook — confirm).
data_cus, data_cou, data_ma, data_on, data_tax = map(
    pd.read_pickle,
    ("data_cus.pkl", "data_cou.pkl", "data_ma.pkl", "data_on.pkl", "data_tax.pkl"),
)

2. Simple Analysis¶

EDA를 통해 이해된 데이터의 특성과 산업의 특성에 따라 간단히 수행할 수 있는 분석작업

2-1. Sales¶

2-1-1. Total Sales¶

Invoice Value: Invoice Value = ((Quantity * Avg_Price) * (1 - Discount_pct) * (1 + GST)) + Delivery_Charges
Total Sales를 이용해 현재 회사의 매출과 가능하다면, E-commerce에서 어느 정도 우위에 있는지 확인한다.

In [3]:

# Attach the GST rate of each product category to every transaction row.
data_merge = pd.merge(data_on, data_tax, how='left', on='Product_Category')

# Attach the coupon that applies to each transaction, keyed by
# (Month, Product_Category).
# NOTE(review): this assumes data_cou holds at most one row per
# (Month, Product_Category); duplicate keys would duplicate transactions and
# inflate every total computed below — confirm against the coupon table.
data_merge['Month'] = data_merge['Transaction_Date'].dt.month
data_cou_copy = data_cou[['CouponID', 'Month', 'Product_Category', 'Discount_pct']]
data_merge = pd.merge(data_merge, data_cou_copy, how='left', on=['Month', 'Product_Category'])

# Categories without any coupon come out of the left join as NaN; treat them
# as a 0 % discount.
data_merge['Discount_pct'] = data_merge['Discount_pct'].fillna(0)

# The discount only counts when the coupon was actually used. Convert the
# percentage to a fraction at the same time (vectorised, no per-row loop).
coupon_used = data_merge['Coupon_Status'] == 'Used'
data_merge['Discount_pct'] = np.where(coupon_used, data_merge['Discount_pct'] / 100, 0)

# Invoice value = quantity * price * (1 - discount) * (1 + GST) + delivery charge.
data_merge['invoice_value'] = (
    data_merge['Quantity'] * data_merge['Avg_Price']
    * (1 - data_merge['Discount_pct'])
    * (1 + data_merge['GST'])
    + data_merge['Delivery_Charges']
)

# Total sales over the whole data set, reported rounded to the nearest thousand.
total_sales = data_merge['invoice_value'].sum()
print("Total Sales for 1 year: ${:,}".format(int(round(total_sales, -3))))

Total Sales for 1 year: $5,397,000

현재, 경쟁사의 Total sales, 년도별 sales 등의 관련 정보가 없으므로 해당 지표에 대한 추후분석은 보류.

2-1-2. Sales per month¶

월별 매출을 확인하여 트렌드를 살펴보고, 어떤 요인이 매출에 영향을 끼칠 수 있는지 고민한다.

In [4]:

# Total invoice value per calendar month, as a plain array ordered by month.
monthly_sales = data_merge.groupby('Month')['invoice_value'].sum().values

# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as
# departures from the usual range. The lower fence must start from Q1, not Q3.
q1, q3 = np.percentile(monthly_sales, [25, 75])
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
lower_bound = q1 - 1.5 * iqr

# Positions of the weakest and the strongest month, highlighted on the chart.
extremes = [np.argmin(monthly_sales), np.argmax(monthly_sales)]

# Visualization
# NOTE(review): assumes every month 1..12 is present in the data — confirm.
month = np.array(range(1, 13))
plt.figure(figsize=(8, 3))
plt.gca().spines['top'].set_visible(False)
plt.plot(month, monthly_sales, linewidth=2, color='red', marker='o', label='Monthly Sales')
plt.xlabel('Month', fontsize=12, loc='right')
plt.ylabel("Sales", fontsize=12, loc='top')
plt.xticks(month, fontsize=9)
plt.yticks(fontsize=9)
plt.tick_params(axis='both', which='major', length=2)
plt.title("Sales per month", fontsize=15)
plt.axhspan(lower_bound, upper_bound, alpha=0.2)  # band used to spot months that leave the usual range
plt.ylim(min(monthly_sales) - 50000, max(monthly_sales) + 50000)
plt.scatter(month[extremes], monthly_sales[extremes], color='blue', s=120, facecolors='none')
plt.show()

No description has been provided for this image

월 별 매출에서 트렌드에 크게 벗어나는 지점은 나타나지 않으며, 6월이 최저, 12월이 최고 매출로 나타난다.
E-commerce 산업에서 가장 중요한 지표 중 하나인 매출에 영향을 끼치는 요인은 어떤 것인지 추후 분석 필요.

2-2. MAU(Monthly Active User) Analysis¶

MAU는 마케팅 성과와 고객 경험을 평가하는데 주요 지표이다.
지난 1년 간 MAU 추이를 확인하여 현재 E-Commerce 회사의 사용자 현황을 통해 어떤 전략을 취할 수 있을지 고민한다.
회원가입, 웹사이트 방문 로그 등의 정보가 존재하지 않으므로 MAU의 기준은 월별 구매가 일어난 사용자의 수로 정의한다.

In [5]:

data_mau = data_on.copy()
data_mau['Month'] = data_on['Transaction_Date'].dt.month

# Keep one row per (month, customer), then count customers per month: the
# number of distinct customers with at least one purchase in that month.
mau = data_mau.drop_duplicates(subset=['Month', 'CustomerID']).groupby('Month')['CustomerID'].count().values

# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as
# departures from the usual range. The lower fence must start from Q1, not Q3.
q1, q3 = np.percentile(mau, [25, 75])
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
lower_bound = q1 - 1.5 * iqr

# Positions of the months with the fewest and the most active users.
extremes = [np.argmin(mau), np.argmax(mau)]

# Visualization
# NOTE(review): assumes every month 1..12 is present in the data — confirm.
month = np.array(range(1, 13))
plt.figure(figsize=(8, 3))
plt.gca().spines[['right', 'top']].set_visible(False)
plt.plot(month, mau, linewidth=2.5, color='teal', marker='o')
plt.title('MAU ', fontsize=15, pad=20)
plt.xlabel('Month', fontsize=12, loc='right')
plt.ylabel("MAU", fontsize=12, loc='top')
plt.xticks(month, fontsize=9)
plt.yticks(fontsize=9)
plt.tick_params(axis='both', which='major', length=2)
plt.ylim(0, max(mau) + 20)
plt.axhspan(lower_bound, upper_bound, alpha=0.2)  # band used to spot months that leave the usual range
plt.scatter(month[extremes], mau[extremes], color='red', s=120, facecolors='none')
plt.show()

MAU의 증감과 매출의 증감의 상관관계를 살펴보아 이를 통해 MAU의 상승이 매출과 관련이 있는지 확인한다.
- 가설 1-1. MAU와 매출은 양의 상관관계를 띌 것이다.
MAU 그래프를 보았을때 대부분 평균치를 웃도나 2월과 8월에서 증감이 눈에띈다. 해당 결과에 영향을 끼치는 요인을 찾아본다.
- 가설 2-1. MAU와 마케팅 금액사용에는 양의 상관관계가 있을 것이다.

2-2-1. 가설 1-1) MAU와 매출은 양의 상관관계를 띌 것이다.¶

In [6]:

# Monthly revenue, recomputed here so the cell carries its own copy.
monthly_sales = data_merge.groupby('Month')['invoice_value'].sum().values

# Visualization: MAU on the left axis, monthly sales on a twin right axis.
fig, ax1 = plt.subplots(figsize=(8,3))

ax1.spines['top'].set_visible(False)
(mau_line,) = ax1.plot(month, mau, linewidth=2, color='teal', marker='o', label='MAU')
ax1.set_title('MAU & Monthly Sales ', fontsize=15, pad=20)
ax1.set_ylim(mau.min() - 20, mau.max() + 20)

ax2 = ax1.twinx()
ax2.spines['top'].set_visible(False)
(sales_line,) = ax2.plot(month, monthly_sales, linewidth=2, color='red', marker='o', label='Monthly Sales')
ax2.set_ylim(monthly_sales.min() - 20000, monthly_sales.max() + 20000)

# One combined legend for the curves living on the two different axes.
lines = [mau_line, sales_line]
labels = [curve.get_label() for curve in lines]
ax2.legend(lines, labels, loc='upper left')

plt.tight_layout()
plt.show()

# Statistical Analysis (Correlation)
# Only 12 monthly observations are available, so the rank-based Spearman
# correlation is reported. The Shapiro-Wilk p-values are printed alongside it
# as a normality check for both series.
mau_normality_p = st.shapiro(mau)[1]
sales_normality_p = st.shapiro(monthly_sales)[1]
print("Normality test result:", mau_normality_p, sales_normality_p)
print("Spearman result:", st.spearmanr(mau, monthly_sales))

Normality test result: 0.3491823673248291 0.3654453158378601
Spearman result: SignificanceResult(statistic=0.11908949961751887, pvalue=0.7123998368899922)

가설 1-1의 검증 결과는 p_value > 0.05의 근거로 기각되어 MAU와 월별 매출의 상관관계는 없는 것으로 확인.
5, 6월의 경우 MAU가 비교적 높더라도 매출은 최저로 나타난다. 반면, 11, 12월의 경우 MAU가 비교적 낮더라도 매출은 최상으로 보인다. 이를 고려하여 추가적으로 고객을 SEGMENT로 분류하여 고객별 적절한 액션을 통해 소비욕구 증가, 이탈방지 등의 효과를 야기해야 될 것이다.
5, 6, 7, 11월의 데이터가 상관관계에 영향을 끼치는 것으로 확인되며, 어떤 항목들이 영향을 끼쳤는지 추후 확인필요

2-2-2. 가설 2-1) MAU와 마케팅 금액사용에는 양의 상관관계가 있을 것이다.¶

In [7]:

# Marketing table: tag every spend record with its calendar month.
data_ma_temp = data_ma.copy()
data_ma_temp['Month'] = data_ma_temp['Date'].dt.month

# Monthly offline / online / combined marketing spend.
monthly_spend = data_ma_temp.groupby('Month')[['Offline_Spend', 'Online_Spend']].sum()
offline_ma = monthly_spend['Offline_Spend']
online_ma = monthly_spend['Online_Spend']
all_ma = offline_ma + online_ma

# Simple Visualization: side-by-side bars per channel plus a line for the total.
plt.figure(figsize=(10, 6))
plt.gca().spines[['right', 'top']].set_visible(False)
bar_width = 0.4
x = np.arange(1, len(month)+1)

plt.bar(x - bar_width/2, offline_ma, color='blue', width=bar_width, label='Offline Marketing')
plt.bar(x + bar_width/2, online_ma, color='orange', width=bar_width, label='Online Marketing')
plt.plot(month, all_ma, marker='o', color='green', linewidth=2, label='All Marketing')

plt.xlabel('Months', fontsize=12)
plt.ylabel('Marketing Cost', fontsize=12)
plt.title('Marketing Cost per month', fontsize=15)
plt.xticks(x, month)
plt.legend()

plt.show()

# Statistical Analysis (Correlation): (monthly_sales, mau) x (marketing cost).
# Each Spearman test is run exactly once; rows follow the heatmap's y labels
# (Monthly Sales, MAU) and columns its x labels (Offline, Online, All).
spend_series = [offline_ma, online_ma, all_ma]
target_series = [monthly_sales, mau]
spearman_results = [
    [st.spearmanr(spend, target) for spend in spend_series]
    for target in target_series
]
correlation_matrix = np.array([[result[0] for result in row] for row in spearman_results])
# Kept as a flat, row-major vector, matching the layout of the 2x3 grid above.
pvalue_matrix = np.array([result[1] for row in spearman_results for result in row])

# Show the coefficient only where it is significant at the 5 % level.
annot_matrix = [
    ["{:.2f}".format(rho) if p < 0.05 else "NS" for rho, p in zip(rho_row, p_row)]
    for rho_row, p_row in zip(correlation_matrix, pvalue_matrix.reshape(correlation_matrix.shape))
]

# Visualization
# NOTE(review): the colour scale is fixed to [0, 1], so a negative coefficient
# would be drawn with the lightest colour — confirm this is intended.
plt.figure(figsize=(9, 6))
heatmap = sns.heatmap(correlation_matrix, cmap='Oranges', vmin=0, vmax=1,
            yticklabels=['Monthly Sales', 'MAU'], xticklabels=['Offline', 'Online', 'All'])
plt.title("Spearman Correlation Matrix", fontsize=15)

# Write the annotation text in the centre of every heatmap cell.
for i, row in enumerate(annot_matrix):
    for j, text in enumerate(row):
        heatmap.text(j + 0.5, i + 0.5, text, ha='center', va='center', fontsize=15, color='black')

plt.show()

Marketing 비용과 월별 매출은 강한 양의 상관관계를 나타내지만, MAU와는 상관관계를 띠지 않는다.
Marketing 비용과 월별 매출간의 인과관계는 더 분석해봐야겠지만, 통상적 개념으로 Marketing 비용을 높일 수록 매출은 더욱 늘어날 것으로 보인다.
MAU에 영향을 끼치는 요인은 추후 분석 필요.

2-3. ARPU Analysis¶

ARPU(Average Revenue Per User): 한 사용자당 평균적으로 발생하는 수익
이전 EDA에서 모든 회원이 1번 이상 거래내역이 있으므로 이 데이터에서는 ARPU와 ARPPU를 동일한 지표로 본다.
각 기간별 ARPU를 통해 현황을 살펴본다.

In [8]:

# Weekly ARPU.
# '%U' is the week of the year with Sunday as the first day; days before the
# first Sunday fall into week 0, so week numbers start at 0, not 1.
data_merge['Week'] = data_merge['Transaction_Date'].dt.strftime('%U').astype('int64')
# Distinct purchasing customers per week (one row per (week, customer), then count).
wau = data_merge.drop_duplicates(subset=['Week', 'CustomerID']).groupby('Week')['CustomerID'].count().values
arpwau = data_merge.groupby('Week')['invoice_value'].sum() / wau

# Monthly ARPU: monthly revenue divided by the monthly active users computed earlier.
arpmau = data_merge.groupby('Month')['invoice_value'].sum() / mau

# ARPU over the whole period: total revenue divided by the number of customers.
arpu = data_merge['invoice_value'].sum() / len(data_cus)
print("\n전체 ARPU: ${:,}\n".format(np.round(arpu, 1)))

# Visualization
# Take the x positions from the grouped index instead of a hard-coded 1..53
# range: the bars then carry the real week numbers and the plot cannot fail
# with a length mismatch when the data spans a different number of weeks.
week = arpwau.index.values
plt.figure(figsize=(10,5))
plt.gca().spines[['right', 'top']].set_visible(False)
plt.bar(week, arpwau)
plt.title("ARPWAU", fontsize=15)
plt.show()

plt.figure(figsize=(10,5))
plt.gca().spines[['right', 'top']].set_visible(False)
plt.bar(arpmau.index, arpmau)
plt.title("ARPMAU", fontsize=15)
plt.show()

전체 ARPU: $3,676.7

ARPWAU와 ARPMAU를 확인해봤을 때, 흥미로운 점은 매출과 MAU가 가장 낮았던 2월에서 가장 큰 ARPU를 나타낸다는 점이다.
- 2월의 고객들의 충성도를 높이는 방향으로 전략을 세워야 할 것이다.
비교적 연초, 연말에는 ARPU가 높고, 중간에는 ARPU가 낮다. 해당 요인을 살펴보는 추후 분석이 필요.

2-4. Category Analysis¶

인기 있는 Category, item은 E-commerce 산업에서 개인화, 마케팅, 재고관리 등에 영향을 끼치는 중요한 지표이다.
E-Commerce에서 현재 인기있는 Category와 item을 찾아 어떤 방식으로 쓰일 수 있을지 전략을 고민한다.

In [9]:

def _plot_quantity_bars(quantities, title, xlabel):
    """Draw one bar per index entry of `quantities` (a Series of units sold)."""
    plt.figure(figsize=(10, 5))
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.title(title, fontsize=15)
    quantities.plot(kind='bar')
    plt.xlabel(xlabel, loc='right')
    plt.ylabel('Quantity', loc='top')
    plt.show()


# Category sales distribution: units sold per category, best sellers first.
category_count = (
    data_merge.groupby('Product_Category')['Quantity'].sum().sort_values(ascending=False)
)
_plot_quantity_bars(category_count, 'Category Sales Distribution', 'Category')

# Item sales distribution: the ten SKUs with the most units sold.
item_count = (
    data_merge.groupby('Product_SKU')['Quantity'].sum().sort_values(ascending=False).head(10)
)
_plot_quantity_bars(item_count, 'Top 10 selling items', 'Item')

# Categories that never appear in the coupon table.
coupon_categories = data_cou['Product_Category'].unique()
print("\ncategory not applied with coupon:", set(category_count.index).difference(coupon_categories))

# Number of distinct categories each customer bought from; the customer IDs
# are hidden on the x axis.
category_counts = data_merge.groupby('CustomerID')['Product_Category'].nunique()
category_counts.plot(kind='bar', color='skyblue')
plt.xticks([])
plt.show()

# Description and category of the top 10 selling items (one row per SKU).
is_top_item = data_merge['Product_SKU'].isin(item_count.index.values)
data_merge.loc[is_top_item, ['Product_SKU', 'Product_Description', 'Product_Category']].drop_duplicates('Product_SKU')

category not applied with coupon: {'Fun', 'More Bags', 'Google', 'Backpacks'}

Out[9]:

	Product_SKU	Product_Description	Product_Category
0	GGOENEBJ079499	Nest Learning Thermostat 3rd Gen-USA - Stainle...	Nest-USA
2	GGOEGFKQ020399	Google Laptop and Cell Phone Stickers	Office
5	GGOEGBMJ013399	Sport Bag	Bags
6	GGOEGDHC018299	Google 22 oz Water Bottle	Drinkware
9	GGOEGGOA017399	Maze Pen	Office
13	GGOENEBQ078999	Nest Cam Outdoor Security Camera - USA	Nest-USA
20	GGOEGOAQ012899	Ballpoint LED Light Pen	Office
43	GGOEGFSR022099	Google Kick Ball	Lifestyle
265	GGOEGFYQ016599	Foam Can and Bottle Cooler	Drinkware
420	GGOEGOLC014299	Google Metallic Notebook Set	Office

인기있는 카테고리는 사무용품, 의류, 물병, 생활용품, 스마트홈 용품이며, 인기있는 물품은 상위 카테고리의 물품들이다.
- 상위 카테고리에 대한 할인정책, 제품라인 확장 등을 통하여 충성고객을 만들 수 있을 것이다.
- 스마트홈에 대한 수요가 충분한 것으로 보아, 제품라인 확대 등의 공격적인 마케팅이 필요해보인다.
하위 카테고리에 대한 과감한 폐지를 통한 고객 집중 혹은 마케팅을 통한 매출량 증대가 필요할 것이다.
- 쿠폰이 적용되지 않은 카테고리는 대부분 하위 카테고리이므로, 쿠폰 적용을 통해 매출 증대를 살펴본다.
고객별 구입 카테고리 수, 동시 구매 카테고리를 통해 개인화 전략에 대한 추후 분석필요

2-5. Coupon Analysis¶

E-commerce 에서 쿠폰은 고객유치, 재고정리, 재구매율 증가, 마케팅 등의 중요한 역할을 수행한다.
현재 E-commerce 회사에서 쿠폰의 사용량을 확인하여 어떤 전략을 취해야 할지 고민한다.
이전 EDA 과정에서, 17개의 카테고리에서 10%, 20%, 30%의 쿠폰이 동일한 비율로 사용되는 것을 확인했다.

In [10]:

# Coupon Status: number of transactions per coupon status.
# count() skips rows whose CouponID is NaN, i.e. transactions in categories
# that got no match in the earlier left join with the coupon table.
coupon_status_count = data_merge.groupby('Coupon_Status')['CouponID'].count()


def plot_pie_chart(title, sizes, explode=(0, 0), labels=('1', '2'), colors=('blue', 'red')):
    """Draw and show a pie chart with percentage labels.

    Args:
        title: Text shown above the chart.
        sizes: Wedge sizes, one value per wedge.
        explode: Radial offset of each wedge; same length as `sizes`.
        labels: Caption of each wedge; same length as `sizes`.
        colors: Wedge colours.

    The defaults are immutable tuples so they cannot be shared and mutated
    across calls; lists are still accepted.
    """
    _, texts, autotexts = plt.pie(sizes, explode=explode, labels=labels, colors=colors,
                                  autopct='%1.1f%%', startangle=90)
    # Enlarge both the wedge captions and the percentage texts.
    for text in texts + autotexts:
        text.set_fontsize(14)
    plt.title(title, fontsize=15)
    plt.axis('equal')  # keep the pie circular
    plt.show()


# Take the wedge labels from the grouped index so they always follow the
# order of the counts instead of relying on a hard-coded list.
status_labels = coupon_status_count.index.tolist()
plot_pie_chart('Coupon Status', coupon_status_count, explode=[.01] * len(status_labels),
               labels=status_labels, colors=['orange', 'tomato', 'dodgerblue'])

쿠폰의 사용비율은 클릭만한 경우>사용한 경우> 사용하지 않은 경우 순으로 나타난다.
해당 결과에서는 쿠폰을 클릭한 유저들에게 구매까지 이어질 수 있도록 고객의 특성을 파악하여 전략을 취해야 할 것이다.

In [11]:

# Persist the merged transaction table, including the columns added in this
# notebook (GST, CouponID, Month, Discount_pct, invoice_value, Week), to a
# pickle file — presumably for reuse by the follow-up analyses.
data_merge.to_pickle('data_merge.pkl')

[LTV] Marketing insights for E-commerce company (0)	2024.06.21
[Cohort Analysis] Marketing insights for E-commerce company (0)	2024.06.15
[Customer Segment] Marketing insights for E-commerce company (0)	2024.06.06
[EDA] Marketing insights for E-commerce company (0)	2024.06.03

Data 공부

Data 공부

[Simple Analysis] Marketing insights for E-commerce company 본문

[Simple Analysis] Marketing insights for E-commerce company

Simple Analysis

0. Import Package¶

1. Data Load¶

2. Simple Analysis¶

2-1. Sales¶

2-1-1. Total Sales¶

2-1-2. Sales per month¶

2-2. MAU(Monthly Active User) Analysis¶

2-2-1. 가설 1-1) MAU와 매출은 양의 상관관계를 띌 것이다.¶

2-2-2. 가설 2-1) MAU와 마케팅 금액사용에는 양의 상관관계가 있을 것이다.¶

2-3. ARPU Analysis¶

2-4. Category Analysis¶

2-5. Coupon Analysis¶

'Data 분석 > E-Commerce data' 카테고리의 다른 글

티스토리툴바

« 2025/04 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30