-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_total_cross.R
74 lines (45 loc) · 2.01 KB
/
cluster_total_cross.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Load packages first so a missing dependency fails before any data is read.
library(rminer)
library(CrossClustering)
library(dplyr)
library(cluster)

# Read the semicolon-separated bank data.
# stringsAsFactors is spelled out because R >= 4.0.0 reads text columns as
# character by default, while the gower dissimilarity computed later with
# daisy() needs them as factors to treat them as nominal variables.
bank <- read.csv(
  "bank-additional-full.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = TRUE
)

# Fix the RNG state so the run is reproducible.
set.seed(1)

# Create an artificial time axis. The data source notes that the records are
# sorted chronologically, so the row number identifies the order in time.
time_axis <- as.numeric(rownames(bank))
bank_time <- cbind(bank, time_axis)

# Clustering input: every column except the target, selected by name rather
# than by position so a changed column order cannot drop the wrong variable.
# NOTE(review): assumes the target column is named `y` (it was removed as
# column 21 before) -- confirm against the CSV header.
stopifnot("y" %in% names(bank_time))
bank_time_without_y <- bank_time[, names(bank_time) != "y", drop = FALSE]
# Clustering ----

# Pairwise gower dissimilarities between all rows of the feature table.
d <- daisy(bank_time_without_y, metric = "gower")

# Run CrossClustering on the dissimilarities with the cluster-count bounds
# used throughout this script (2 to 19).
clusters <- CrossClustering(
  d,
  k.w.min = 2,
  k.w.max = 19,
  k.c.max = 19
)

# Report the number of clusters, the silhouette value and the percentage of
# rows that were not assigned to any cluster.
cat("amount of clusters training set:", clusters$Optimal.cluster, "\n")
cat("Silhouette of training set clusters", clusters$Silhouette, "\n")
cat(
  "Ommited information",
  100 * (1 - clusters$n.clustered / clusters$n.total),
  "%",
  "\n"
)

# Trigger a garbage collection after the memory-heavy clustering step.
gc()
# Number of clusters reported by CrossClustering.
clus_amount <- unlist(clusters$Optimal.cluster)

# Label the clustered rows: for each cluster q, take its member rows from
# bank_time and append a clust_n column holding q.
# seq_len() yields no iterations when there are zero clusters (1:0 would
# visit 1 and 0), and the pieces are bound once at the end instead of growing
# a data frame with rbind() on every pass.
cluster_parts <- lapply(seq_len(clus_amount), function(q) {
  members <- bank_time[clusters$Cluster.list[[q]], ]
  cbind(members, clust_n = rep(q, nrow(members)))
})
clust_tot <- do.call(rbind, cluster_parts)

# With no clusters at all do.call() returns NULL; fall back to an empty data
# frame that has the same columns as a labelled piece.
if (is.null(clust_tot)) {
  clust_tot <- cbind(bank_time[0, ], clust_n = integer(0))
}
# Add the rows that were not assigned to any cluster.

# Row indices of all clustered items (kept for inspection; not used below).
clusters_info <- unlist(clusters$Cluster.list)

# Unassigned rows are those whose time_axis value does not occur in the
# clustered set. time_axis is the row number, so it is a unique key; naming it
# avoids an implicit join over every shared column.
non_cluster_group <- anti_join(bank_time, clust_tot, by = "time_axis")

# Unassigned rows get cluster label 0.
non_cluster_group$clust_n <- numeric(nrow(non_cluster_group))

# Build the final data set: clustered rows first, unassigned rows after.
data <- rbind(clust_tot, non_cluster_group)

# Write the result as a semicolon-separated text file.
write.table(data, "~/thesis/data/total_with_clust.txt", sep = ";")
gc()