Memuat library yang diperlukan
library(cluster)
library(factoextra)
library(ggplot2)
library(ggplotify)
library(dplyr)
library(utils)
library(plotly)
library(mice)
Membaca data
# Load the tab-separated campaign file; the first row supplies column names.
data <- read.delim("marketing_campaign.csv")
Membuat atribut baru
# Reference date against which customer seniority is measured.
now <- as.Date("1-5-2023", format = "%d-%m-%Y")

# Derive the engineered attributes in a single pass. Columns that already carry
# one of these names are dropped first, so the new ones are always appended at
# the end of the data frame.
data <- data %>%
  select(-any_of(c("Age", "Spending", "Children", "Seniority"))) %>%
  mutate(
    # Customer age, computed relative to the year 2023.
    Age = 2023 - Year_Birth,
    # Sum of the six Mnt* product columns.
    Spending = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds,
    # Number of children in the household (Kidhome + Teenhome).
    Children = Kidhome + Teenhome,
    # Enrollment date parsed as day-month-year.
    Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y"),
    # Whole days between the enrollment date and the reference date.
    Seniority = as.integer(now - Dt_Customer)
  )

# Keep only the attributes used by the rest of the analysis.
dataCust <- data[, c(
  "Age", "Marital_Status", "Children", "Education", "Seniority", "Income",
  "Spending", "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
  "MntSweetProducts", "MntGoldProds"
)]

# Restrict the rows to customers whose Marital_Status is Single or Married.
dataCust <- dataCust[dataCust$Marital_Status %in% c("Single", "Married"), ]
Diskretisasi atribut Age
# Discretize Age into a three-level factor:
# 1 = adult (27-61); 2 = elderly (62-96); 3 = very old (97-130)
# The lowest break is 0, so every age in (0, 61] lands in level 1; ages outside
# (0, 130] become NA.
dataCust$Age <- cut(
  as.numeric(dataCust$Age),
  breaks = c(0, 61, 96, 130),
  labels = c(1, 2, 3)
)
Memeriksa missing value
# Tabulate the missing-data pattern of the selected attributes.
mice::md.pattern(dataCust)
## Age Marital_Status Children Education Seniority Spending MntWines
## 1328 1 1 1 1 1 1 1
## 16 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
## 1328 1 1 1 1 1
## 16 1 1 1 1 1
## 0 0 0 0 0
## Income
## 1328 1 0
## 16 0 1
## 16 16
Mengisi missing value pada atribut Income dengan nilai rata-rata
# Replace every missing Income with the mean of the observed Income values.
dataCust$Income <- replace(
  dataCust$Income,
  is.na(dataCust$Income),
  mean(dataCust$Income, na.rm = TRUE)
)

# Re-check the missing-data pattern after imputation.
md.pattern(dataCust)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Age Marital_Status Children Education Seniority Income Spending MntWines
## 1344 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
## 1344 1 1 1 1 1 0
## 0 0 0 0 0 0
Pengodean data kategorik
# Education attribute
# 1 = undergraduate (Basic, 2n Cycle); 2 = postgraduate (Graduation, Master, PhD)
dataCust$Education[dataCust$Education %in% c("Basic", "2n Cycle")] <- 1
dataCust$Education[dataCust$Education %in% c("Graduation", "Master", "PhD")] <- 2
# The column still holds text at this point ("1"/"2"); convert it to integer.
dataCust$Education <- as.integer(dataCust$Education)

# Marital_Status attribute
# 1 = Single; 2 = Married
dataCust$Marital_Status <- dataCust$Marital_Status %>%
  recode("Single" = 1, "Married" = 2) %>%
  as.integer()
Normalisasi
#' Min-max scale a numeric vector to the [0, 1] range.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the range
#'   and stay `NA` in the result. The default `FALSE` keeps the previous
#'   behaviour, where any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. If all values are equal,
#'   the result is 0 for every element instead of `NaN`.
normalize <- function(x, na.rm = FALSE) {
  # Nothing to scale; also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A constant vector has zero span; dividing by it would give NaN everywhere.
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - rng[1]) / span
}
# Rescale Income, Spending and each per-category amount with normalize();
# every column is scaled independently of the others.
scaled_cols <- c(
  "Income", "Spending", "MntWines", "MntFruits", "MntMeatProducts",
  "MntFishProducts", "MntSweetProducts", "MntGoldProds"
)
dataCust[scaled_cols] <- lapply(dataCust[scaled_cols], normalize)

# NOTE(review): Seniority, Children, Education and Marital_Status are left on
# their original scales -- confirm this is intended if they feed a
# distance-based method later on.
summary(dataCust)
## Age Marital_Status Children Education Seniority
## 1:1006 Min. :1.000 Min. :0.000 Min. :1.000 Min. :3229
## 2: 337 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:2.000 1st Qu.:3403
## 3: 1 Median :2.000 Median :1.000 Median :2.000 Median :3580
## Mean :1.643 Mean :0.933 Mean :1.884 Mean :3579
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:3757
## Max. :2.000 Max. :3.000 Max. :2.000 Max. :3927
## Income Spending MntWines MntFruits
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.2043 1st Qu.:0.0253 1st Qu.:0.01474 1st Qu.:0.01005
## Median :0.3074 Median :0.1468 Median :0.10918 Median :0.04020
## Mean :0.3096 Mean :0.2347 Mean :0.19792 Mean :0.13130
## 3rd Qu.:0.4146 3rd Qu.:0.4053 3rd Qu.:0.32351 3rd Qu.:0.16583
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
## Min. :0.000000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.008121 1st Qu.:0.01181 1st Qu.:0.003802 1st Qu.:0.02486
## Median :0.036543 Median :0.04724 Median :0.030418 Median :0.06354
## Mean :0.097061 Mean :0.14328 Mean :0.102288 Mean :0.11919
## 3rd Qu.:0.131961 3rd Qu.:0.19291 3rd Qu.:0.133080 3rd Qu.:0.14917
## Max. :1.000000 Max. :1.00000 Max. :1.000000 Max. :1.00000
# Show the column types and first values of the prepared data frame.
utils::str(dataCust)
## 'data.frame': 1344 obs. of 13 variables:
## $ Age : Factor w/ 3 levels "1","2","3": 2 2 1 1 1 1 1 2 1 2 ...
## $ Marital_Status : int 1 1 2 2 2 2 2 1 2 2 ...
## $ Children : int 0 2 1 1 1 0 0 0 2 1 ...
## $ Education : int 2 2 2 2 2 1 2 2 2 2 ...
## $ Seniority : int 3891 3341 3389 3645 3454 3821 3855 3810 3780 3686 ...
## $ Income : num 0.352 0.277 0.353 0.196 0.31 ...
## $ Spending : num 0.63968 0.00873 0.16548 0.06508 0.00556 ...
## $ MntWines : num 0.42532 0.00737 0.11587 0.0509 0.00335 ...
## $ MntFruits : num 0.44221 0.00503 0.21608 0.05025 0.02513 ...
## $ MntMeatProducts : num 0.3161 0.0029 0.0679 0.0319 0.0029 ...
## $ MntFishProducts : num 0.67717 0.00787 0.1811 0.01181 0 ...
## $ MntSweetProducts: num 0.3346 0.0038 0.1027 0.0038 0.0076 ...
## $ MntGoldProds : num 0.24309 0.01657 0.04144 0.06354 0.00276 ...