R create variable IF ELSE leads to wrong values - r

I have a dataframe with:
"serial" the number of households, each one with a variable number of components "head, spouse, parent and child or grandchild" and total number of children in the house "nchild"
I want to create a new variable (in the dput I added an example for clarity: withCM 'living with male child' and withCF). I have tried various combinations but I cannot discriminate on the sex of the child within the same "serial", so that for withCM=1 only when relate=="child"&sex==1, but the 1 would appear on a different row (that of the head, spouse or parent)
mydata$withCM<- ifelse(mydata$nchild>0&mydata$relate!="child",1,0)
mydata <- structure(list(serial = c(12345L, 12345L, 12345L, 12345L, 12346L,
12346L, 12347L, 12347L, 12347L, 12348L, 12348L, 12348L, 12348L,
12348L, 12348L, 12348L, 12349L, 12350L, 12350L, 12351L, 12351L,
12351L, 12352L, 12352L, 12352L, 12352L, 12352L, 12353L, 12354L,
12354L), age = c(45L, 44L, 13L, 11L, 29L, 28L, 65L, 61L, 35L,
68L, 61L, 35L, 34L, 6L, 2L, 1L, 62L, 54L, 52L, 67L, 67L, 12L,
49L, 50L, 28L, 21L, 22L, 70L, 89L, 55L), sex = c(1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), relate = structure(c(4L,
7L, 1L, 1L, 4L, 7L, 6L, 6L, 4L, 4L, 7L, 1L, 2L, 3L, 3L, 3L, 4L,
4L, 7L, 4L, 7L, 3L, 4L, 7L, 1L, 5L, 5L, 4L, 6L, 4L), .Label = c("child",
"childinlaw", "grandchild", "head", "nonrelative", "parent",
"spouse"), class = "factor"), nchild = c(2L, 2L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), conhija = c(1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L), conhijo = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("serial",
"age", "sex", "relate", "nchild", "conhija", "conhijo"), class = "data.frame", row.names = c(NA,
-30L))

You can tabulate the gender, family, and role-within-family as:
xtab <- table(mydata$serial, mydata$sex, mydata$relate)
And then choose the heads of the families (or, in the commented line, anyone who has the specific relationship), and alter their tallies as follows:
mydata$sex1 <- 0
mydata$sex2 <- 0
ind <- mydata$relate=="head"
#ind <- mydata$relate %in% c("head","spouse","parent")
mydata$sex1[ind] <- xtab[as.character(mydata$serial[ind]), "1", "child"]
mydata$sex2[ind] <- xtab[as.character(mydata$serial[ind]), "2", "child"]

Use lapply to split into families, then test if they are an adult, and there is at least one male child in the unit.
lives_with_boy <- function(serial)
{
unit <- mydata[mydata$serial==serial,]
as.character(unit$relate) %in% c("head","spouse","parent") & any(unit$relate == "child" & unit$sex==1)
}
mydata$withCM <- unlist(lapply(unique(mydata$serial),lives_with_boy ))

Related

How to create Stratified Sampling for multiple columns in R

my data set has got 821049 variables and 18 columns. I would like to take 9 columns for the stratified sampling. These are "BASKETS_NZ", "PIS", "PIS_AP" "PIS_DV", "PIS_PL", "PIS_SDV", "PIS_SHOPS" "PIS_SR", "QUANTITY". My stratification variable is ID = 1:821049. How do I choose the intervals for my variables? How do I set the size of the sampling?
dpt(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
Here is a solution to perform a stratified sampling based on multiple columns. Before implementing this, consider that your data is continuous and a sufficiently large that just a random sampling is adequate.
To solve this problem is to take a stratified sample from each group. The potential approaches to group the data together is by either pasting the 9 columns together or using dplyr's groupby function.
Using the solution is this question How to get around error "factor has new levels" in cross-validation glm? and updating with dplyr style.
This dplyr_stratified function will take the desired sampling ration and an arbitrary number of column and will return a data frame with the sampled rows. See the example below for taking 2 columns.
set.seed(1)
x <- rnorm(n = 100)
y <- rep(x = c("A","B"), times = c(50,50))
z <- rep(x = c("D","E","F"), times = c(33,33,34))
data <- data.frame(x, y=sample(y, replace = TRUE), z=sample(z, replace=TRUE))
library(dplyr)
#optional tag row for later identification:
data$rowid<-1:nrow(data)
dplyr_stratified <- function(df, percent, ...){
columns<-enquos(...)
#group then sample each group
out<-df %>% group_by(!!!columns) %>% slice( sample(1:n(), percent*n()))
}
testgroup<-dplyr_stratified(data, 0.8, z, y)
testgroup
Note: this is assuming each grouping will have a sufficient number of sample in order to select a representative sample. (If the groups are too small then this approach may not meet expectations)

Plotting the distribution for multiple columns

I would like to plot the distribution of multiple columns of my data set. It has over 820.000 rows and 18 columns. I want to plot all columns except the columns with the dummy variables. I have already been able to create a graphic. But I want to have the values of the x-axis on the y-axis because these are the column values and I want to display their distribution for each column.
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust); library(clValid); library(plotrix)
library(graphics); library(reshape2)
3. Import csv file
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
4 Select columns
WKA_ohneJB2 <- c(WKA_ohneJB[, "BASKETS_NZ"], WKA_ohneJB[, "PIS"], WKA_ohneJB[, "PIS_AP"],
WKA_ohneJB[, "PIS_DV"], WKA_ohneJB[, "PIS_PL"], WKA_ohneJB [, "PIS_SDV"],
WKA_ohneJB[, "PIS_SHOPS"], WKA_ohneJB[,"PIS_SR"], WKA_ohneJB[, "QUANTITY"]
)
df <- melt(WKA_ohneJB2)
5 Plot
ggplot(df) +
geom_col(aes(x= WKA_ohneJB2 , y=value))
This is the plot I have generated so far.
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
6 Plotting histogram
var_to_plot = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL","PIS_SDV", "PIS_SHOPS","PIS_SR", "QUANTITY")
par(mfrow=c(3,3))
for(i in var_to_plot){hist(WKA_ohneJB[,i],xlab=i,main="")}
I have created several histograms. But the scaling of the axes is wrong. I want the numerical values of the x axis to appear on the y axis and the numerical values of the y axis to appear on the x axis. How does this work? I also want the values to be displayed completely and not as e^.
You don't need to combine your dataframe all over again. What you need is either a density plot or histogram.
Also as good practice, load only the packages required for plotting, in this case it would be maybe ggplot2 and tidyr.
For example, I just used an example with 5 of the column names I can see in your data:
library(tidyr)
library(ggplot2)
WKA_ohneJB = data.frame(dummyvar=1:10000,sapply(1:5,rnorm,n=10000))
colnames(WKA_ohneJB)[-1] = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
head(WKA_ohneJB)
dummyvar BASKETS_NZ PIS PIS_AP PIS_DV PIS_PL
1 1 0.92088518 0.9167877 1.956920 4.695379 4.349631
2 2 0.05335686 2.8225161 3.059749 4.317281 5.985579
3 3 1.00141759 3.5743033 2.499662 4.761415 5.886588
4 4 -1.31231486 2.5335004 5.396917 4.364643 5.866026
5 5 -0.65336724 0.2647117 3.203358 4.838659 4.437011
6 6 0.78769080 0.3630670 2.516433 3.826074 3.741611
To one of them do:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_histogram()
Or:
ggplot(WKA_ohneJB,aes(x=PIS)) + geom_density()
To plot everything at one go, you can try to pivot it long, as you have done with melt, but I don't know if your machine can handle it, so try it for a few variables first:
var_to_plot = c("BASKETS_NZ","PIS","PIS_AP","PIS_DV","PIS_PL")
dummyvar = "dummyvar"
ggplot(pivot_longer(WKA_ohneJB[,c(var_to_plot,dummyvar)],-dummyvar),
aes(x=value)) +
geom_histogram() +
facet_wrap(~name)
If melting the data.frame is too intensive, just use baseR plot:
# means 2 rows, 3 columns
par(mfrow=c(2,3))
for(i in var_to_plot){hist(WKA_ohneJB[,i],xlab=i,main="")}

Error in grid.Call.graphics(C_polygon, x$x, x$y, index) : invalid color name in 'cluster' (package Mclust)

I use the Model Based Clustering from the package mclust for a subset of my data set. MClust recommended 1 cluster. The following error was displayed in the graphical display:
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
WKA_ohneJB_scale <- scale (WKA_ohneJB)
set.seed(123)
WKA_ohneJB_scale_sub <- WKA_ohneJB_scale [c(1:8000) , c ("BASKETS_NZ", "LOGONS", "PIS", "PIS_AP", "PIS_DV", "PIS_PL", "PIS_SHOPS",
"EXIST_CUST", "WEB_CUST", "MOBILE_CUST", "TABLET_CUST", "LOGON_CUST_STEP2",
"PIS_SDV", "PIS_SR", "QUANTITY", "WKA", "NEW_CUST")]
MC_Baur <- Mclust(WKA_ohneJB_scale_sub)
fviz_mclust(MC_Baur, "classification", geom = "point", pointsize = 1.0, palette = "jco")
Error message: Error in grid.Call.graphics(C_polygon, x$x, x$y, index) :
invalid color name in 'cluster'
Here is part of my data set:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

Dimensionality reduction methods and clustering algorithms for large data set

my data set has got 17 columns and > 80.000 variables. The data set consists entirely of numeric variables. Some columns are dummy variables. I want to use my data set to apply different hard and soft clustering algorithms and compare them. Which methods of dimension reduction and clustering algorithms are recommended for large data sets?
Here is a part of my dataset:
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
17x80000 is not so large. You should be able to apply any clustering method on this dataset. It is hard to tell what will work best not knowing the data and the problem in detail. Have a look at "Introduction to Statistical Learning", Ch. 10 for clustering methods. There are also some very instructive R labs for this chapter, which should give you a very quick start.
For further reading also consider "Elements of Statistical Learning" (Chapter 13 onwards).

Calculating optimal number of clusters with Nbclust()

I would like to calculate the optimal number of clusters for a large dataset: 17 columns and >80.000 rows.
This is my code:
1. Definition of the path
setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")
2. Loading the required packages
library(factoextra); library(cluster); library(skmeans); library(mclust);
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust)
3. Import csv file
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
WKA_ohneJB_scaled <- scale(WKA_ohneJB)
# NbClust ()
nb <- NbClust(WKA_ohneJB_scaled , distance = "manhattan", min.nc = 2, max.nc = 7, method = "kmeans")
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
Error: Error in na.omit(jeu1) : object 'polygons' not found
Simple means of determining number of clusters is to examine the elbow in the plot of within groups sum of squares and/or average width of the silhouette, the code produces simple plots to examine these...
In order to perform clustering, you need to solve the problem of NaNs after scaling...
WKA_ohneJB_scaled <- as.matrix(scale(data[, c(-1, -2, -18)]))
plot_scree_clusters <- function(x) {
wss <- 0
max_i <- 10 # max clusters
for (i in 1:max_i) {
km.model <- kmeans(x, centers = i, nstart = 20)
wss[i] <- km.model$tot.withinss
}
plot(1:max_i, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
}
plot_scree_clusters(WKA_ohneJB_scaled)
plot_sil_width <- function(x) {
sw <- 0
max_i <- 10 # max clusters
for (i in 2:max_i) {
km.model <- cluster::pam(x = pc_comp$x, k = i)
sw[i] <- km.model$silinfo$avg.width
}
sw <- sw[-1]
plot(2:max_i, sw, type = "b",
xlab = "Number of Clusters",
ylab = "Average silhouette width")
}
plot_sil_width(WKA_ohneJB_scaled)
Use the Elbow Method, as alluded to by knytt. Here are a couple references that describe the technique.
https://www.r-bloggers.com/finding-optimal-number-of-clusters/
https://uc-r.github.io/kmeans_clustering#elbow
Also, consider using the Affinity Propogation library. The AP library will automatically determine the optimal number of clusters for you. Check out the siple example below.
install.packages("apcluster")
library("apcluster")
c1 <- cbind(rnorm(30,.3,.5),rnorm(30.7,.4))
c2 <- cbind(rnorm(30,.7,.4),rnorm(30.4,.5))
x1 <- rbind(c1,c2)
plot(x1, xlab="", ylab="", pch=19, cex=.8)
apresia <- apcluster(negDistMat(r=2),x1)
s1 <- negDistMat(x1,r=2)
apres1b <- apcluster(s1)
apresia
plot(apresia, x1)
Resource:
https://cran.r-project.org/web/packages/apcluster/vignettes/apcluster.pdf

Resources