Selection of columns by criterion

Selection of columns by criterion - r

my dataframe has got 17 columns and 821049 rows. The columns are
BASKETS_NZ, LOGONS, PIS, PIS_AP, PIS_DV, PIS_SDV, PIS_PL, PIS_SHOPS, PIS_SR, QUANTITY, WKA, NEW_CUST, EXIST_CUST, WEB_CUST, MOBILE_CUST, TABLET_CUST, LOGON_CUST_STEP2
I want to select all rows for which WKA = 1. What is the syntax for this?
dpt(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

Many ways to do that. Among them:
With data.table :
library(data.table)
as.data.table(df1)[WKA == 1]
With dplyr:
library(dplyr)
df1 %>% filter(WKA==1)

We can use subset
subset(df1, WKA == 1)

Related

Plot PAM Cluster results with fviz_clust

My data set has got 821.000 rows and 18 columns. The variables are continous. As there are some dummy variables I selected only the continous columns. I used pam cluster algorithm and a dissmiliarity matrix. While trying to plot the cluster results an error came up.
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 9L, 11L, 15L, 16L,
821037L, 821038L, 821039L, 821040L, 821041L, 821042L, 821043L,
821044L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 13L, 23L, 8L, 7L, 24L, 3L, 111L, 33L, 3L, 46L, 11L,
8L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 2L, 1L,
0L, 0L, 0L, 0L, 13L, 0L, 0L, 2L, 1L, 0L, 0L, 1L), PIS_DV = c(3L,
19L, 4L, 1L, 0L, 0L, 2L, 6L, 0L, 0L, 1L, 1L, 38L, 8L, 0L,
5L, 2L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 2L, 0L, 0L, 0L,
11L, 0L, 0L, 4L, 0L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 0L), PIS_SDV = c(18L,
0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 0L, 6L, 0L, 0L,
13L, 0L, 0L, 1L, 0L), PIS_SHOPS = c(3L, 24L, 13L, 3L, 0L,
0L, 2L, 17L, 0L, 0L, 7L, 1L, 71L, 16L, 2L, 5L, 6L, 0L, 3L,
2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
11L, 0L, 6L, 0L, 0L, 20L, 0L, 0L, 1L, 0L), QUANTITY = c(13L,
2L, 18L, 1L, 14L, 1L, 5L, 1L, 1L, 8L, 1L, 1L, 5L, 2L, 2L,
4L, 1L, 3L, 17L, 8L), WKA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), NEW_CUST = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WEB_CUST = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L),
TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L), LOGON_CUST_STEP2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 9L,
11L, 15L, 16L, 821037L, 821038L, 821039L, 821040L, 821041L, 821042L,
821043L, 821044L, 821047L, 821048L), class = "data.frame")
Code
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
Baur_WKA <- subset(WKA_ohneJB, WKA == 1)
Baur_WKA_scale <- scale (Baur_WKA[c(2,4,5,6,7,8,9,10,11)])
set.seed (123)
sample <- Baur_WKA_scale[sample(nrow(Baur_WKA_scale), 10000), ]
dist.eucl <- dist(sample, method = "euclidean")
pam.res <- pam(dist.eucl, 3, metric = "euclidean", stand = FALSE)
fviz_cluster(pam.res, palette = c("#00AFBB", "#FC4E07", "#9932CC"), # color palette
ellipse.type = "t", # Concentration ellipse
repel = FALSE, # Avoid label overplotting (slow)
ggtheme = theme_classic() )
Error: Error in array(x, c(length(x), 1L), if (!is.null(names(x))) list(names(x), :
'data' must be of type vector, was 'NULL

How to create Stratified Sampling for multiple columns in R

my data set has got 821049 variables and 18 columns. I would like to take 9 columns for the stratified sampling. These are "BASKETS_NZ", "PIS", "PIS_AP" "PIS_DV", "PIS_PL", "PIS_SDV", "PIS_SHOPS" "PIS_SR", "QUANTITY". My stratification variable is ID = 1:821049. How do I choose the intervals for my variables? How do I set the size of the sampling?
dpt(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

Here is a solution to perform a stratified sampling based on multiple columns. Before implementing this, consider that your data is continuous and a sufficiently large that just a random sampling is adequate.
To solve this problem is to take a stratified sample from each group. The potential approaches to group the data together is by either pasting the 9 columns together or using dplyr's groupby function.
Using the solution is this question How to get around error "factor has new levels" in cross-validation glm? and updating with dplyr style.
This dplyr_stratified function will take the desired sampling ration and an arbitrary number of column and will return a data frame with the sampled rows. See the example below for taking 2 columns.
set.seed(1)
x <- rnorm(n = 100)
y <- rep(x = c("A","B"), times = c(50,50))
z <- rep(x = c("D","E","F"), times = c(33,33,34))
data <- data.frame(x, y=sample(y, replace = TRUE), z=sample(z, replace=TRUE))
library(dplyr)
#optional tag row for later identification:
data$rowid<-1:nrow(data)
dplyr_stratified <- function(df, percent, ...){
columns<-enquos(...)
#group then sample each group
out<-df %>% group_by(!!!columns) %>% slice( sample(1:n(), percent*n()))
}
testgroup<-dplyr_stratified(data, 0.8, z, y)
testgroup
Note: this is assuming each grouping will have a sufficient number of sample in order to select a representative sample. (If the groups are too small then this approach may not meet expectations)

Error in prcomp.default(data, scale = FALSE, center = FALSE) : cannot rescale a constant/zero column to unit variance

my data set hast got 821049 rows and 18 columns. It is about shopping cart abandonment. My task is to cluster shopping cart abandonments. Therefore I used the command subset to display only the lines of the users with a dropout (WKA ==1). Then I tried to execute K-Means and visualize the results graphically. Here an error occurred. I wonder if the error has to do with the variance of some of the variables. There are variables with a low variance for example 0.01, 0.06.
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";")
WKA <- subset(WKA_ohneJB, WKA == 1)
set.seed (123)
sample <- WKA [sample(nrow(WKA), 10000), ]
fviz_nbclust(sample, kmeans, method = "wss")+ geom_vline(xintercept = 4, linetype = 2)
set.seed(123)
km.res <- kmeans(sample, 3, nstart = 40)
print(km.res)
fviz_cluster(km.res, data = sample, scale = T, palette = c("#2E9FDF", "#00AFBB", "#E7B800"), ellipse.type = "jaccard",
star.plot = TRUE,
repel = TRUE, ggtheme = theme_minimal() )
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 9L, 11L, 15L, 16L,
821037L, 821038L, 821039L, 821040L, 821041L, 821042L, 821043L,
821044L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 13L, 23L, 8L, 7L, 24L, 3L, 111L, 33L, 3L, 46L, 11L,
8L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 2L, 1L,
0L, 0L, 0L, 0L, 13L, 0L, 0L, 2L, 1L, 0L, 0L, 1L), PIS_DV = c(3L,
19L, 4L, 1L, 0L, 0L, 2L, 6L, 0L, 0L, 1L, 1L, 38L, 8L, 0L,
5L, 2L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 2L, 0L, 0L, 0L,
11L, 0L, 0L, 4L, 0L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 0L), PIS_SDV = c(18L,
0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 0L, 6L, 0L, 0L,
13L, 0L, 0L, 1L, 0L), PIS_SHOPS = c(3L, 24L, 13L, 3L, 0L,
0L, 2L, 17L, 0L, 0L, 7L, 1L, 71L, 16L, 2L, 5L, 6L, 0L, 3L,
2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
11L, 0L, 6L, 0L, 0L, 20L, 0L, 0L, 1L, 0L), QUANTITY = c(13L,
2L, 18L, 1L, 14L, 1L, 5L, 1L, 1L, 8L, 1L, 1L, 5L, 2L, 2L,
4L, 1L, 3L, 17L, 8L), WKA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), NEW_CUST = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), WEB_CUST = c(1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L),
TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L), LOGON_CUST_STEP2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 9L,
11L, 15L, 16L, 821037L, 821038L, 821039L, 821040L, 821041L, 821042L,
821043L, 821044L, 821047L, 821048L), class = "data.frame")
Error in prcomp.default(data, scale = FALSE, center = FALSE) :
cannot rescale a constant/zero column to unit variance

Error in grid.Call.graphics(C_polygon, x$x, x$y, index) : invalid color name in 'cluster' (package Mclust)

I use the Model Based Clustering from the package mclust for a subset of my data set. MClust recommended 1 cluster. The following error was displayed in the graphical display:
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
WKA_ohneJB_scale <- scale (WKA_ohneJB)
set.seed(123)
WKA_ohneJB_scale_sub <- WKA_ohneJB_scale [c(1:8000) , c ("BASKETS_NZ", "LOGONS", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SHOPS",
"EXIST_CUST", "WEB_CUST", "MOBILE_CUST", "TABLET_CUST", "LOGON_CUST_STEP2",
"PIS_SDV", "PIS_SR", "QUANTITY", "WKA", "NEW_CUST")]
MC_Baur <- Mclust(WKA_ohneJB_scale_sub)
fviz_mclust(MC_Baur, "classification", geom = "point", pointsize = 1.0, palette = "jco")
Error message: Error in grid.Call.graphics(C_polygon, x$x, x$y, index) :
invalid color name in 'cluster'
Here is part of my data set:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

Dimensionality reduction methods and clustering algorithms for large data set

my data set has got 17 columns and > 80.000 variables. The data set consists entirely of numeric variables. Some columns are dummy variables. I want to use my data set to apply different hard and soft clustering algorithms and compare them. Which methods of dimension reduction and clustering algorithms are recommended for large data sets?
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

17x80000 is not so large. You should be able to apply any clustering method on this dataset. It is hard to tell what will work best not knowing the data and the problem in detail. Have a look at "Introduction to Statistical Learning", Ch. 10 for clustering methods. There are also some very instructive R labs for this chapter, which should give you a very quick start.
For further reading also consider "Elements of Statistical Learning" (Chapter 13 onwards).

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Selection of columns by criterion - r

Many ways to do that. Among them: With data.table : library(data.table) as.data.table(df1)[WKA == 1] With dplyr: library(dplyr) df1 %>% filter(WKA==1)

We can use subset subset(df1, WKA == 1)

Related

Plot PAM Cluster results with fviz_clust

How to create Stratified Sampling for multiple columns in R

Error in prcomp.default(data, scale = FALSE, center = FALSE) : cannot rescale a constant/zero column to unit variance

Error in grid.Call.graphics(C_polygon, x$x, x$y, index) : invalid color name in 'cluster' (package Mclust)

Dimensionality reduction methods and clustering algorithms for large data set

Categories

Resources