Memuat library yang diperlukan

library(cluster)
library(factoextra)
library(ggplot2)
library(ggplotify)
library(dplyr)
library(utils)
library(plotly)
library(mice)

Membaca data

# Load the tab-separated campaign file; the first row holds the column names.
data <- read.delim("marketing_campaign.csv", header = TRUE)

Praproses Data

Membuat atribut baru

# Derive Age, Spending and Children, and parse Dt_Customer into a Date.
# Any existing columns named like the derived attributes are dropped first so
# the new attributes are always appended after the raw columns, in a fixed
# order.
data <- data %>%
  select(-any_of(c("Age", "Spending", "Children", "Seniority"))) %>%
  mutate(
    # Customer age, measured against the year 2023.
    Age = 2023 - Year_Birth,
    # Sum of the six Mnt* product columns.
    Spending = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds,
    # Kidhome plus Teenhome.
    Children = Kidhome + Teenhome,
    # Dt_Customer arrives as day-month-year text.
    Dt_Customer = as.Date(Dt_Customer, format = "%d-%m-%Y")
  )

# Seniority: whole days between Dt_Customer and the reference date.
# Computed outside mutate() so `now` always refers to this script-level value
# rather than to a same-named column.
now <- as.Date("1-5-2023", format = "%d-%m-%Y")
data$Seniority <- as.integer(now - data$Dt_Customer)

# Keep only the attributes needed for the analysis, and only the rows whose
# Marital_Status is Single or Married.
dataCust <- data[
  data$Marital_Status %in% c("Single", "Married"),
  c(
    "Age", "Marital_Status", "Children", "Education", "Seniority", "Income",
    "Spending", "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
    "MntSweetProducts", "MntGoldProds"
  )
]

Diskretisasi atribut Age

# Discretise Age into three bands using right-closed intervals:
#   1 = (0, 61], 2 = (61, 96], 3 = (96, 130]
# The original note describes these as adult (27-61), elderly (62-96) and
# very old (97-130). Values outside (0, 130] become NA.
dataCust$Age <- cut(
  as.numeric(dataCust$Age),
  breaks = c(0, 61, 96, 130),
  labels = c("1", "2", "3")
)

Memeriksa missing value

# Tabulate the missing-data pattern of every column.
mice::md.pattern(dataCust)

##      Age Marital_Status Children Education Seniority Spending MntWines
## 1328   1              1        1         1         1        1        1
## 16     1              1        1         1         1        1        1
##        0              0        0         0         0        0        0
##      MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
## 1328         1               1               1                1            1
## 16           1               1               1                1            1
##              0               0               0                0            0
##      Income   
## 1328      1  0
## 16        0  1
##          16 16

Mengisi missing value pada atribut Income dengan nilai rata-rata

# Mean imputation: every missing Income is replaced by the mean of the
# observed Income values.
dataCust$Income <- replace(
  dataCust$Income,
  is.na(dataCust$Income),
  mean(dataCust$Income, na.rm = TRUE)
)
# Re-check the missingness pattern after imputation.
md.pattern(dataCust)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      Age Marital_Status Children Education Seniority Income Spending MntWines
## 1344   1              1        1         1         1      1        1        1
##        0              0        0         0         0      0        0        0
##      MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds  
## 1344         1               1               1                1            1 0
##              0               0               0                0            0 0

Pengodean data kategorik menjadi numerik

# Education attribute
# 1 = undergraduate (Basic, 2n Cycle); 2 = postgraduate (Graduation, Master, PhD)
# Labels are first mapped to the text codes "1"/"2" and then converted to
# integer; any label not listed here is left as-is by recode() and therefore
# becomes NA in the conversion.
dataCust$Education <- as.integer(recode(
  dataCust$Education,
  "Basic" = "1",
  "2n Cycle" = "1",
  "Graduation" = "2",
  "Master" = "2",
  "PhD" = "2"
))

# Marital_Status attribute
# Encoded by position in the lookup vector: 1 = Single, 2 = Married.
# Any other value becomes NA.
dataCust$Marital_Status <- match(
  dataCust$Marital_Status,
  c("Single", "Married")
)

Normalisasi

# Min-max scale a numeric vector to the range [0, 1].
#
# Args:
#   x:     Numeric vector to rescale.
#   na.rm: If TRUE, missing values are ignored when computing the minimum and
#          maximum (they stay NA in the result). The default, FALSE, keeps the
#          previous behaviour: any NA in `x` makes the whole result NA.
#
# Returns:
#   A vector the same length as `x`. When every observed value is identical
#   the range is zero; the result is then all zeros instead of the NaN values
#   a division by zero would produce.
normalize <- function(x, na.rm = FALSE) {
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  # Guard against 0 / 0 for constant input: dividing by 1 leaves x - min = 0.
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - bounds[1]) / span
}
# Rescale Income, Spending and the six per-product amounts with normalize(),
# one column at a time, then write the rescaled columns back by name.
dataCust <- modifyList(
  dataCust,
  lapply(
    dataCust[c(
      "Income", "Spending", "MntWines", "MntFruits", "MntMeatProducts",
      "MntFishProducts", "MntSweetProducts", "MntGoldProds"
    )],
    normalize
  )
)
# Summary statistics of the prepared data.
summary(dataCust)
##  Age      Marital_Status     Children       Education       Seniority   
##  1:1006   Min.   :1.000   Min.   :0.000   Min.   :1.000   Min.   :3229  
##  2: 337   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:2.000   1st Qu.:3403  
##  3:   1   Median :2.000   Median :1.000   Median :2.000   Median :3580  
##           Mean   :1.643   Mean   :0.933   Mean   :1.884   Mean   :3579  
##           3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:3757  
##           Max.   :2.000   Max.   :3.000   Max.   :2.000   Max.   :3927  
##      Income          Spending         MntWines         MntFruits      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.2043   1st Qu.:0.0253   1st Qu.:0.01474   1st Qu.:0.01005  
##  Median :0.3074   Median :0.1468   Median :0.10918   Median :0.04020  
##  Mean   :0.3096   Mean   :0.2347   Mean   :0.19792   Mean   :0.13130  
##  3rd Qu.:0.4146   3rd Qu.:0.4053   3rd Qu.:0.32351   3rd Qu.:0.16583  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##  MntMeatProducts    MntFishProducts   MntSweetProducts    MntGoldProds    
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.008121   1st Qu.:0.01181   1st Qu.:0.003802   1st Qu.:0.02486  
##  Median :0.036543   Median :0.04724   Median :0.030418   Median :0.06354  
##  Mean   :0.097061   Mean   :0.14328   Mean   :0.102288   Mean   :0.11919  
##  3rd Qu.:0.131961   3rd Qu.:0.19291   3rd Qu.:0.133080   3rd Qu.:0.14917  
##  Max.   :1.000000   Max.   :1.00000   Max.   :1.000000   Max.   :1.00000
# Show the column types and first values of the prepared data.
utils::str(dataCust)
## 'data.frame':    1344 obs. of  13 variables:
##  $ Age             : Factor w/ 3 levels "1","2","3": 2 2 1 1 1 1 1 2 1 2 ...
##  $ Marital_Status  : int  1 1 2 2 2 2 2 1 2 2 ...
##  $ Children        : int  0 2 1 1 1 0 0 0 2 1 ...
##  $ Education       : int  2 2 2 2 2 1 2 2 2 2 ...
##  $ Seniority       : int  3891 3341 3389 3645 3454 3821 3855 3810 3780 3686 ...
##  $ Income          : num  0.352 0.277 0.353 0.196 0.31 ...
##  $ Spending        : num  0.63968 0.00873 0.16548 0.06508 0.00556 ...
##  $ MntWines        : num  0.42532 0.00737 0.11587 0.0509 0.00335 ...
##  $ MntFruits       : num  0.44221 0.00503 0.21608 0.05025 0.02513 ...
##  $ MntMeatProducts : num  0.3161 0.0029 0.0679 0.0319 0.0029 ...
##  $ MntFishProducts : num  0.67717 0.00787 0.1811 0.01181 0 ...
##  $ MntSweetProducts: num  0.3346 0.0038 0.1027 0.0038 0.0076 ...
##  $ MntGoldProds    : num  0.24309 0.01657 0.04144 0.06354 0.00276 ...