How to create Stratified Sampling for multiple columns in R - r

My data set has 821049 rows and 18 columns. I would like to use 9 columns for the stratified sampling. These are "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY". My stratification variable is ID = 1:821049. How do I choose the intervals for my variables? How do I set the size of the sample?
# dput() writes an R expression that recreates the object; here the first and
# last ten rows of the data set are dumped so the sample can be shared.
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

Here is a solution to perform stratified sampling based on multiple columns. Before implementing this, consider that your data are continuous and sufficiently large that simple random sampling may well be adequate.
The way to solve this problem is to take a random sample from each group. The groups can be formed either by pasting the 9 columns together or by using dplyr's group_by function.
This uses the solution from the question How to get around error "factor has new levels" in cross-validation glm?, updated to dplyr style.
This dplyr_stratified function takes the desired sampling ratio and an arbitrary number of columns and returns a data frame with the sampled rows. See the example below, which stratifies on 2 columns.
# Reproducible toy data: one numeric column plus two categorical columns whose
# labels are shuffled by sampling with replacement.
set.seed(1)
x <- rnorm(100)
y <- rep(c("A", "B"), times = c(50, 50))
z <- rep(c("D", "E", "F"), times = c(33, 33, 34))
data <- data.frame(
  x,
  y = sample(y, replace = TRUE),
  z = sample(z, replace = TRUE)
)
library(dplyr)
# Optional: tag every row with its position so that sampled rows can be traced
# back to the original data later. seq_len() is used instead of 1:nrow(data)
# because the latter yields c(1, 0) for a data frame without rows.
data$rowid <- seq_len(nrow(data))
#' Stratified random sample of a data frame
#'
#' Groups `df` by the columns supplied in `...` and draws, without
#' replacement, a share of `percent` of the rows from every group.
#'
#' @param df A data frame (or tibble).
#' @param percent Sampling fraction per group, a single number in [0, 1]. The
#'   number of rows taken from a group is `floor(percent * group_size)`, so
#'   very small groups can end up contributing no rows.
#' @param ... Unquoted names of the columns that define the strata.
#' @return An ungrouped tibble holding the sampled rows of `df`.
dplyr_stratified <- function(df, percent, ...) {
  stopifnot(
    is.numeric(percent), length(percent) == 1L,
    !is.na(percent), percent >= 0, percent <= 1
  )
  columns <- enquos(...)
  # Group, then sample row positions inside each group. The pipeline is the
  # last expression (not an assignment) so the result is returned visibly and
  # prints when the function is called at the console.
  df %>%
    group_by(!!!columns) %>%
    slice(sample.int(n(), floor(percent * n()))) %>%
    # Drop the grouping so later verbs do not silently operate per stratum.
    ungroup()
}
testgroup<-dplyr_stratified(data, 0.8, z, y)
testgroup
Note: this assumes each group contains enough rows for a representative sample to be drawn. (If the groups are too small, this approach may not meet expectations.)

Related

Plot PAM Cluster results with fviz_clust

My data set has 821.000 rows and 18 columns. The variables are continuous. As there are some dummy variables, I selected only the continuous columns. I used the PAM clustering algorithm with a dissimilarity matrix. While trying to plot the cluster results, an error came up.
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 9L, 11L, 15L, 16L,
821037L, 821038L, 821039L, 821040L, 821041L, 821042L, 821043L,
821044L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 13L, 23L, 8L, 7L, 24L, 3L, 111L, 33L, 3L, 46L, 11L,
8L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 2L, 1L,
0L, 0L, 0L, 0L, 13L, 0L, 0L, 2L, 1L, 0L, 0L, 1L), PIS_DV = c(3L,
19L, 4L, 1L, 0L, 0L, 2L, 6L, 0L, 0L, 1L, 1L, 38L, 8L, 0L,
5L, 2L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 2L, 0L, 0L, 0L,
11L, 0L, 0L, 4L, 0L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 0L), PIS_SDV = c(18L,
0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 0L, 6L, 0L, 0L,
13L, 0L, 0L, 1L, 0L), PIS_SHOPS = c(3L, 24L, 13L, 3L, 0L,
0L, 2L, 17L, 0L, 0L, 7L, 1L, 71L, 16L, 2L, 5L, 6L, 0L, 3L,
2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
11L, 0L, 6L, 0L, 0L, 20L, 0L, 0L, 1L, 0L), QUANTITY = c(13L,
2L, 18L, 1L, 14L, 1L, 5L, 1L, 1L, 8L, 1L, 1L, 5L, 2L, 2L,
4L, 1L, 3L, 17L, 8L), WKA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), NEW_CUST = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WEB_CUST = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L),
TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L), LOGON_CUST_STEP2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 9L,
11L, 15L, 16L, 821037L, 821038L, 821039L, 821040L, 821041L, 821042L,
821043L, 821044L, 821047L, 821048L), class = "data.frame")
Code
# Read the semicolon-separated export and keep the rows with WKA == 1.
WKA_ohneJB <- read.csv(
  file = "WKA_ohneJB_PCA.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
Baur_WKA <- subset(WKA_ohneJB, WKA == 1)

# Standardise columns 2 and 4 to 11.
Baur_WKA_scale <- scale(Baur_WKA[c(2, 4:11)])

# Reproducible random subset of 10000 rows.
set.seed(123)
sample <- Baur_WKA_scale[sample(nrow(Baur_WKA_scale), 10000), ]

# Euclidean distances between the sampled rows, then PAM with 3 clusters.
# NOTE(review): pam() receives a dist object here, so the fitted object may
# not carry the raw observations that fviz_cluster() needs for plotting;
# confirm against the cluster / factoextra documentation.
dist.eucl <- dist(sample, method = "euclidean")
pam.res <- pam(dist.eucl, 3, metric = "euclidean", stand = FALSE)

fviz_cluster(
  pam.res,
  palette = c("#00AFBB", "#FC4E07", "#9932CC"), # colour palette
  ellipse.type = "t",                           # concentration ellipse
  repel = FALSE,                                # TRUE avoids label overplotting (slow)
  ggtheme = theme_classic()
)
Error: Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of type vector, was 'NULL

Error in prcomp.default(data, scale = FALSE, center = FALSE) : cannot rescale a constant/zero column to unit variance

My data set has 821049 rows and 18 columns. It is about shopping cart abandonment. My task is to cluster shopping cart abandonments. Therefore I used the command subset to keep only the rows of the users with a dropout (WKA == 1). Then I tried to run k-means and visualize the results graphically. Here an error occurred. I wonder if the error has to do with the variance of some of the variables. There are variables with a low variance, for example 0.01 or 0.06.
# Read the semicolon-separated export and keep the rows with WKA == 1.
WKA_ohneJB <- read.csv(
  file = "WKA_ohneJB_PCA.csv",
  header = TRUE,
  sep = ";"
)
WKA <- subset(WKA_ohneJB, WKA == 1)

# Reproducible random subset of 10000 rows.
set.seed(123)
sample <- WKA[sample(nrow(WKA), 10000), ]

# Within-cluster sum of squares per cluster count, with a dashed marker at 4.
fviz_nbclust(sample, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2)

# k-means with 3 centres and 40 random starts, reseeded for reproducibility.
set.seed(123)
km.res <- kmeans(sample, 3, nstart = 40)
print(km.res)

fviz_cluster(
  km.res,
  data = sample,
  scale = TRUE,
  palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
  ellipse.type = "jaccard",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme_minimal()
)
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 9L, 11L, 15L, 16L,
821037L, 821038L, 821039L, 821040L, 821041L, 821042L, 821043L,
821044L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 13L, 23L, 8L, 7L, 24L, 3L, 111L, 33L, 3L, 46L, 11L,
8L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 2L, 1L,
0L, 0L, 0L, 0L, 13L, 0L, 0L, 2L, 1L, 0L, 0L, 1L), PIS_DV = c(3L,
19L, 4L, 1L, 0L, 0L, 2L, 6L, 0L, 0L, 1L, 1L, 38L, 8L, 0L,
5L, 2L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 2L, 0L, 0L, 0L,
11L, 0L, 0L, 4L, 0L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 0L), PIS_SDV = c(18L,
0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 0L, 6L, 0L, 0L,
13L, 0L, 0L, 1L, 0L), PIS_SHOPS = c(3L, 24L, 13L, 3L, 0L,
0L, 2L, 17L, 0L, 0L, 7L, 1L, 71L, 16L, 2L, 5L, 6L, 0L, 3L,
2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
11L, 0L, 6L, 0L, 0L, 20L, 0L, 0L, 1L, 0L), QUANTITY = c(13L,
2L, 18L, 1L, 14L, 1L, 5L, 1L, 1L, 8L, 1L, 1L, 5L, 2L, 2L,
4L, 1L, 3L, 17L, 8L), WKA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), NEW_CUST = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WEB_CUST = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L),
TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L), LOGON_CUST_STEP2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 9L,
11L, 15L, 16L, 821037L, 821038L, 821039L, 821040L, 821041L, 821042L,
821043L, 821044L, 821047L, 821048L), class = "data.frame")
Error in prcomp.default(data, scale = FALSE, center = FALSE) :
cannot rescale a constant/zero column to unit variance

Plotting the distribution for multiple columns

I would like to plot the distribution of multiple columns of my data set. It has over 820.000 rows and 18 columns. I want to plot all columns except the columns with the dummy variables. I have already been able to create a graphic. But I want to have the values of the x-axis on the y-axis because these are the column values and I want to display their distribution for each column.
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust); library(clValid); library(plotrix)
library(graphics); library(reshape2)
3. Import csv file
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
4 Select columns
# Concatenate the nine selected columns, one after the other, into a single
# long vector; melt() then wraps that vector in a data frame.
WKA_ohneJB2 <- unlist(
  WKA_ohneJB[c(
    "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL",
    "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY"
  )],
  use.names = FALSE
)
df <- melt(WKA_ohneJB2)
5 Plot
# Bar chart with the concatenated vector on the x axis and the melted values
# on the y axis.
ggplot(data = df) +
  geom_col(mapping = aes(x = WKA_ohneJB2, y = value))
This is the plot I have generated so far.
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
6 Plotting histogram
# Columns to inspect, drawn as one histogram per panel in a 3 x 3 grid.
var_to_plot <- c(
  "BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL",
  "PIS_SDV", "PIS_SHOPS", "PIS_SR", "QUANTITY"
)
par(mfrow = c(3, 3))
for (i in var_to_plot) {
  # Label the x axis with the column name and suppress the default title.
  hist(WKA_ohneJB[, i], xlab = i, main = "")
}
I have created several histograms. But the scaling of the axes is wrong. I want the numerical values of the x axis to appear on the y axis and the numerical values of the y axis to appear on the x axis. How does this work? I also want the values to be displayed completely and not as e^.
You don't need to combine your dataframe all over again. What you need is either a density plot or histogram.
Also as good practice, load only the packages required for plotting, in this case it would be maybe ggplot2 and tidyr.
For example, I just used an example with 5 of the column names I can see in your data:
library(tidyr)
library(ggplot2)
# Simulated stand-in for the real data: an id column plus five columns of
# 10000 normal draws each, with means 1, 2, ..., 5.
WKA_ohneJB <- data.frame(
  dummyvar = 1:10000,
  sapply(1:5, function(mu) rnorm(n = 10000, mean = mu))
)
# Rename the simulated columns (everything except the id column).
colnames(WKA_ohneJB)[-1] <- c("BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL")
head(WKA_ohneJB)
dummyvar BASKETS_NZ PIS PIS_AP PIS_DV PIS_PL
1 1 0.92088518 0.9167877 1.956920 4.695379 4.349631
2 2 0.05335686 2.8225161 3.059749 4.317281 5.985579
3 3 1.00141759 3.5743033 2.499662 4.761415 5.886588
4 4 -1.31231486 2.5335004 5.396917 4.364643 5.866026
5 5 -0.65336724 0.2647117 3.203358 4.838659 4.437011
6 6 0.78769080 0.3630670 2.516433 3.826074 3.741611
To plot one of them, do:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_histogram()
Or:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_density()
To plot everything at one go, you can try to pivot it long, as you have done with melt, but I don't know if your machine can handle it, so try it for a few variables first:
# Reshape the chosen columns to long format (one row per value, keyed by the
# id column) and draw one histogram panel per original column.
var_to_plot <- c("BASKETS_NZ", "PIS", "PIS_AP", "PIS_DV", "PIS_PL")
dummyvar <- "dummyvar"
ggplot(
  pivot_longer(WKA_ohneJB[, c(var_to_plot, dummyvar)], -dummyvar),
  aes(x = value)
) +
  geom_histogram() +
  facet_wrap(~name)
If melting the data.frame is too intensive, just use baseR plot:
# Panel layout: 2 rows by 3 columns, one base-graphics histogram per column
# named in var_to_plot.
par(mfrow = c(2, 3))
for (i in var_to_plot) {
  hist(WKA_ohneJB[, i], xlab = i, main = "")
}

Dimensionality reduction methods and clustering algorithms for large data set

My data set has 17 columns and > 80.000 rows. The data set consists entirely of numeric variables. Some columns are dummy variables. I want to use my data set to apply different hard and soft clustering algorithms and compare them. Which methods of dimension reduction and clustering algorithms are recommended for large data sets?
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
17x80000 is not so large. You should be able to apply any clustering method on this dataset. It is hard to tell what will work best not knowing the data and the problem in detail. Have a look at "Introduction to Statistical Learning", Ch. 10 for clustering methods. There are also some very instructive R labs for this chapter, which should give you a very quick start.
For further reading also consider "Elements of Statistical Learning" (Chapter 13 onwards).

Calculating optimal number of clusters with Nbclust()

I would like to calculate the optimal number of clusters for a large dataset: 17 columns and >80.000 rows.
This is my code:
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust)
3. Import csv file
# Read the semicolon-separated export and standardise every column.
WKA_ohneJB <- read.csv(
  file = "WKA_ohneJB_PCA.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
WKA_ohneJB_scaled <- scale(WKA_ohneJB)

# NbClust(): evaluate k-means partitions with 2 to 7 clusters on Manhattan
# distances.
nb <- NbClust(
  WKA_ohneJB_scaled,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 7,
  method = "kmeans"
)
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
Error: Error in na.omit(jeu1) : object 'polygons' not found
Simple means of determining number of clusters is to examine the elbow in the plot of within groups sum of squares and/or average width of the silhouette, the code produces simple plots to examine these...
In order to perform clustering, you need to solve the problem of NaNs after scaling...
# Scale the question's data set after dropping columns 1, 2 and 18. In the
# posted sample these are the row id X, BASKETS_NZ and LOGON_CUST_STEP2; the
# latter two are constant there, and scaling a constant column yields NaN.
# TODO confirm they are constant in the full data as well.
WKA_ohneJB_scaled <- as.matrix(scale(WKA_ohneJB[, c(-1, -2, -18)]))
#' Scree ("elbow") plot of the k-means within-cluster sum of squares
#'
#' Runs kmeans() (20 random starts each) for every cluster count from 1 to
#' `max_i` and plots the total within-cluster sum of squares against the
#' number of clusters.
#'
#' @param x Numeric matrix or data frame to cluster; handed to kmeans().
#' @param max_i Largest number of clusters to try. Defaults to 10, the value
#'   that was previously hard-coded.
#' @return Invisibly, a numeric vector of length `max_i` holding the total
#'   within-cluster sum of squares for 1, ..., `max_i` clusters.
plot_scree_clusters <- function(x, max_i = 10) {
  stopifnot(
    is.numeric(max_i), length(max_i) == 1L,
    !is.na(max_i), max_i >= 1
  )
  # Preallocate the result instead of growing it inside the loop.
  wss <- numeric(max_i)
  for (i in seq_len(max_i)) {
    km.model <- kmeans(x, centers = i, nstart = 20)
    wss[i] <- km.model$tot.withinss
  }
  plot(seq_len(max_i), wss, type = "b",
       xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  # Hand the plotted values back so callers can inspect them.
  invisible(wss)
}
plot_scree_clusters(WKA_ohneJB_scaled)
#' Plot the average silhouette width for a range of cluster counts
#'
#' Fits cluster::pam() to `x` for every cluster count from 2 to `max_i` and
#' plots the average silhouette width against the number of clusters.
#'
#' @param x Data to cluster; handed to cluster::pam() as its `x` argument.
#' @param max_i Largest number of clusters to try (at least 2). Defaults to
#'   10, the value that was previously hard-coded.
#' @return Invisibly, a numeric vector with the average silhouette width for
#'   2, ..., `max_i` clusters.
plot_sil_width <- function(x, max_i = 10) {
  stopifnot(
    is.numeric(max_i), length(max_i) == 1L,
    !is.na(max_i), max_i >= 2
  )
  cluster_counts <- 2:max_i
  # Preallocate the result instead of growing it inside the loop.
  sw <- numeric(length(cluster_counts))
  for (j in seq_along(cluster_counts)) {
    # Cluster the function's own argument; the previous version read an
    # undefined global (`pc_comp$x`) and ignored `x` altogether.
    pam.model <- cluster::pam(x = x, k = cluster_counts[j])
    sw[j] <- pam.model$silinfo$avg.width
  }
  plot(cluster_counts, sw, type = "b",
       xlab = "Number of Clusters",
       ylab = "Average silhouette width")
  # Hand the plotted values back so callers can inspect them.
  invisible(sw)
}
plot_sil_width(WKA_ohneJB_scaled)
Use the Elbow Method, as alluded to by knytt. Here are a couple references that describe the technique.
https://www.r-bloggers.com/finding-optimal-number-of-clusters/
https://uc-r.github.io/kmeans_clustering#elbow
Also, consider using the Affinity Propagation library (apcluster). It determines the number of clusters automatically. Check out the simple example below.
install.packages("apcluster")
library("apcluster")
# Two Gaussian point clouds of 30 points each. rnorm() takes n, mean, sd in
# that order; the second column of each cloud previously read
# rnorm(30.7, .4) / rnorm(30.4, .5), i.e. a comma had been typed as a decimal
# point, which silently produced n = 30 draws with sd = 1.
c1 <- cbind(rnorm(30, .3, .5), rnorm(30, .7, .4))
c2 <- cbind(rnorm(30, .7, .4), rnorm(30, .4, .5))
x1 <- rbind(c1, c2)
plot(x1, xlab = "", ylab = "", pch = 19, cex = .8)
# Affinity propagation on negative squared Euclidean distances, computed on
# the fly from the data ...
apresia <- apcluster(negDistMat(r = 2), x1)
# ... or from a precomputed similarity matrix.
s1 <- negDistMat(x1, r = 2)
apres1b <- apcluster(s1)
apresia
plot(apresia, x1)
Resource:
https://cran.r-project.org/web/packages/apcluster/vignettes/apcluster.pdf

Resources