Count Complete Cases per Group - r

I have a big data set (roughly 10 000 rows), and want to create a function that counts the number of complete cases (not NAs) per group. I tried various functions (aggregate, table, sum(complete.cases), group_by, etc), but somehow I miss one - probably little - trick. Thanks for any help!
A little sample data set to explain, the result I need.
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
print(x)
# group age speed
#1 1 4 12
#2 2 3 NA
#3 3 2 15
#4 4 1 NA
#5 1 11 12
#6 2 NA NA
#7 3 13 15
#8 4 NA NA
One function I wrote reads as follows:
CountPerGroup <- function(group) {
data.set <- subset(x,group %in% group)
vect <- vector()
for (i in 1:length(group)) {
vect[i] <- sum(complete.cases(data.set))
}
output <- data.frame(cbind(group,count=vect))
return(output)
}
The result of
CountPerGroup(2:1)
is
group count
1 2 4
2 1 4
Unfortunately, this is wrong. Instead the outcome should look like
group count
1 2 1
2 1 4
What am I missing? How can I tell R to count of complete.cases per Group?
Thank you very much for any help on this!

Something like should do the trick if you wish to maintain your functionality:
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
CountPerGroup <- function(x, groups) {
data.set <- subset(x, group %in% groups)
ans <- sapply(split(data.set, data.set$group),
function(y) sum(complete.cases(y)))
return(data.frame(group = names(ans), count = unname(ans)))
}
CountPerGroup(x, 1:2)
# group count
#1 1 2
#2 2 0
Which is correct from what I can count. But it does not agree with your suggested outcome.
EDIT
It seems that you want the number of non-NA instead and correctly sorted. Use this function instead:
CountPerGroup2 <- function(x, groups) {
data.set <- subset(x, group %in% groups)
ans <- sapply(split(data.set, data.set$group),
function(y) sum(!is.na(y[, !grepl("group", names(y))])))[groups]
return(data.frame(group = names(ans), count = unname(ans)))
}
CountPerGroup2(x, 2:1)
# group count
#1 2 1
#2 1 4

If you are just looking for a way to get the full count of non-NA values per group, you could use something like:
library(plyr)
x <- data.frame(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
counts <- ddply(x, "group", summarize, count=sum(!is.na(c(age, speed))))
## group count
## 1 1 4
## 2 2 1
## 3 3 4
## 4 4 1
You do miss out on having a function that lets you query a subset of the groups, but you get a one-line way to calculate the full solution.

Here is a way with data.table
library(data.table)
library(functional)
countPerGroup = function(x, vec)
{
dt = data.table(x)
d1 = setkey(dt, group)[group %in% vec]
d2 = d1[,lapply(.SD, Compose(Negate(is.na), sum)),by=group]
transform(d2, count=age+speed, speed=NULL, age=NULL)
}
countPerGroup(x, 1:2)
# group count
#1: 1 4
#2: 2 1
countPerGroup(x, c(1,2))
# group count
#1: 1 4
#2: 2 1
If you have a high number of lines in your data.table, it is particularly efficient!

I just had the same problem and found an easier solution
library(data.table)
x <- data.table(group = c(1:4),
age = c(4:1, c(11, NA,13, NA)),
speed = c(12, NA,15,NA))
x[,sum(complete.cases(.SD)), by=group]

Related

Filtering ids when they have the same value across a column in r [duplicate]

I have a data frame like below
sample <- data.frame(ID = 1:9,
Group = c('AA','AA','AA','BB','BB','CC','CC','BB','CC'),
Value = c(1,1,1,2,2,2,3,2,3))
ID Group Value
1 AA 1
2 AA 1
3 AA 1
4 BB 2
5 BB 2
6 CC 2
7 CC 3
8 BB 2
9 CC 3
I want to select groups according to the number of distinct (unique) values within each group. For example, select groups where all values within the group are the same (one distinct value per group). If you look at the group CC, it has more than one distinct value (2 and 3) and should thus be removed. The other groups, with only one distinct value, should be kept. Desired output:
ID Group Value
1 AA 1
2 AA 1
3 AA 1
4 BB 2
5 BB 2
8 BB 2
Would you tell me simple and fast code in R that solves the problem?
Here's a solution using dplyr:
library(dplyr)
sample <- data.frame(
ID = 1:9,
Group= c('AA', 'AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'BB', 'CC'),
Value = c(1, 1, 1, 2, 2, 2, 3, 2, 3)
)
sample %>%
group_by(Group) %>%
filter(n_distinct(Value) == 1)
We group the data by Group, and then only select groups where the number of distinct values of Value is 1.
data.table version:
library(data.table)
sample <- as.data.table(sample)
sample[ , if(uniqueN(Value) == 1) .SD, by = Group]
# Group ID Value
#1: AA 1 1
#2: AA 2 1
#3: AA 3 1
#4: BB 4 2
#5: BB 5 2
#6: BB 8 2
An alternative using ave if the data is numeric, is to check if the variance is 0:
sample[with(sample, ave(Value, Group, FUN=var ))==0,]
An alternative solution that could be faster on large data is:
setkey(sample, Group, Value)
ans <- sample[unique(sample)[, .N, by=Group][N==1, Group]]
The point is that calculating unique values for each group could be time consuming when there are more groups. Instead, we can set the key on the data.table, then take unique values by key (which is extremely fast) and then count the total values for each group. We then require only those where it is 1. We can then perform a join (which is once again very fast). Here's a benchmark on large data:
require(data.table)
set.seed(1L)
sample <- data.table(ID=1:1e7,
Group = sample(rep(paste0("id", 1:1e5), each=100)),
Value = sample(2, 1e7, replace=TRUE, prob=c(0.9, 0.1)))
system.time (
ans1 <- sample[,if(length(unique(Value))==1) .SD ,by=Group]
)
# minimum of three runs
# user system elapsed
# 14.328 0.066 14.382
system.time ({
setkey(sample, Group, Value)
ans2 <- sample[unique(sample)[, .N, by=Group][N==1, Group]]
})
# minimum of three runs
# user system elapsed
# 5.661 0.219 5.877
setkey(ans1, Group, ID)
setkey(ans2, Group, ID)
identical(ans1, ans2) # [1] TRUE
You can make a selector for sample using ave many different ways.
sample[ ave( sample$Value, sample$Group, FUN = function(x) length(unique(x)) ) == 1,]
or
sample[ ave( sample$Value, sample$Group, FUN = function(x) sum(x - x[1]) ) == 0,]
or
sample[ ave( sample$Value, sample$Group, FUN = function(x) diff(range(x)) ) == 0,]
Here's an approach
> ind <- aggregate(Value~Group, FUN=function(x) length(unique(x))==1, data=sample)[,2]
> sample[sample[,"Group"] %in% levels(sample[,"Group"])[ind], ]
ID Group Value
1 1 AA 1
2 2 AA 1
3 3 AA 1
4 4 BB 2
5 5 BB 2
8 8 BB 2

Change the names of multiple cells in R using a function

Consider the following data frame:
df <- setNames(data.frame(1:5,rep(1,5)), c("id", "value"))
I want to change the names for multiple cells in the column "id". Let's say I want to change the following:
df$id[df$id %In% 2:3] <- 1
df$id[df$id == 4] <- 3
However, instead of using the code above, I want to create a function, where I can do the transformation more "smooth" (because I have a lot of data frames, where I need to change the names for the cells). I want to create a function:
mapping <- function(...) {
...
}
where I afterward can create a simple and smooth mapping function for my df, where I only have to specific the "old" and the "new" names for the cells. Something like this:
df_mapping <- function(...) {
2.1
3.1
4.3
}
And then I can apply the function on my data and specific which column it should do it for, and it will work in the same way as the code with gsub:
df <- df_mapping(df,id)
Is it possible to create that mapping function?
if we need a function, then can have a 'data' argument, column name, values to replace and replacer value, then create the logical condition, subset the column, assign with replacer_val and return the dataset after the assignment
f1 <- function(dat, colnm, values_to_replace, replacer_val) {
dat[[colnm]][dat[[colnm]] %in% values_to_replace] <- replacer_val
return(dat)
}
f1(df, "id", c(2, 3), 1)
-output
# id value
#1 1 1
#2 1 1
#3 1 1
#4 4 1
#5 5 1
To replace values with corresponding sets of replacers,
f2 <- function(dat, colnm, values_to_replace, replacer_vals) {
nm1 <- setNames(replacer_vals, values_to_replace)
v1 <- nm1[as.character(dat[[colnm]])]
i1 <- !is.na(v1)
dat[[colnm]][i1] <- v1[i1]
return(dat)
}
f2(df, "id", c(2, 3), c(5, 6))
# id value
#1 1 1
#2 5 1
#3 6 1
#4 4 1
#5 5 1
Or another option is to create a key/value dataset and use merge or join
library(data.table)
f3 <- function(dat, colnm, values_to_replace, replacer_vals) {
keydat <- data.frame(key = values_to_replace, val = replacer_vals)
names(keydat)[1] <- colnm
dt <- as.data.table(dat)
dt[keydat, (colnm) := val, on = colnm][]
return(dt)
}
f3(df, "id", c(2, 5), c(3, 6))
Maybe a mapping like below could help
mapping <- function(df, id, to_replace, obj_value) {
transform(df, id = replace(id, id %in% to_replace, obj_value))
}
e.g.,
> mapping(df, id, c(2, 3), 1)
id value
1 1 1
2 1 1
3 1 1
4 4 1
5 5 1
You can use dplyr's recode function
mapping <- function(data, col, old, new) {
data[[col]] <- dplyr::recode(data[[col]], !!!setNames(new, old))
data
}
mapping(df, "id", c(2, 3), c(7L, 8L))
# id value
#1 1 1
#2 7 1
#3 8 1
#4 4 1
#5 5 1

Getting rows whose value are greater than the group mean

I have a data frame where column "A" has 6 distinct values. Column "B" has float values. By using dplyr, I can group by column "A" and find mean of column "B" of each group as follows:
mydf %>% group_by(A) %>% summarize(Mean = mean(B, na.rm=TRUE))
My utter aim is to find rows in each group whose "B" values are higher than the group average. How can I achieve this (using base R or dplyr)?
A simple alternative with base R ave would be
df[df$b > ave(df$b, df$a) , ]
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
The default argument for ave is mean so no need to mention it explicitly, if there are NA values present in b modify it to
df[df$b > ave(df$b, df$a, FUN = function(x) mean(x,na.rm = TRUE)) , ]
Another solution with subset and ave as suggested by #Onyambu
subset(df,b>ave(b,a))
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
data
df <- data.frame(a = rep(c(1, 2), each = 5), b = 1:10)
df
# a b
#1 1 1
#2 1 2
#3 1 3
#4 1 4
#5 1 5
#6 2 6
#7 2 7
#8 2 8
#9 2 9
#10 2 10
You can just group and then filter:
mydf %>%
group_by(A) %>%
filter(B > mean(B, na.rm = TRUE)) %>%
ungroup()
Using Base R, I would go for this. It is not as elegant as dplyr.
mean.df <- aggregate(mydf$b, by =list(a = mydf$a), FUN = mean)
names(mean.df)[2] <- "mean"
mydf <- merge(mydf, mean.df, by = "a")
# Rows whose values are higher than mean
new.df <- subset(mydf, b > mean, select = -mean)
I like working with Data tables. So a data.table solution would be,
mydt <- data.table(mydf)
mydt[, mean := mean(b), by = a]
new.dt <- mydt[b > mean, -c("mean"), with = TRUE]
Another way to do it using base R and tapply:
mydf = cbind.data.frame(A=sample(6,20,rep=T),B=runif(20))
mydf.ave = tapply(mydf$B,mydf$A,mean)
newdf = mydf[mydf$B > mydf.ave[as.character(mydf$A)],]
(thus the one liner would be:mydf[mydf$B > tapply(mydf$B,mydf$A,mean)[as.character(mydf$A)],])

Factor levels reaching certain values

I need to find out how many factor levels reach values of a continuous variable.
The code below produces the desired result for the example data, but it is rather an awkward work around.
My real dataframe is much larger and the real plot should show more values (or is continuous) on the x-axis. I would appreciate an applicable code a lot.
set.seed(5)
df <- data.frame(ID = factor(c("a","a","b","c","d","e","e")),values = runif(7,0,6))
seq <- 1:5
length.unique <- function(x) length(unique(x))
sub1 <- df[which(df$values >= 1), ]
sub2 <- df[which(df$values >= 2), ]
sub3 <- df[which(df$values >= 3), ]
sub4 <- df[which(df$values >= 4), ]
sub5 <- df[which(df$values >= 5), ]
N_IDs <- c(length.unique(sub1$ID),length.unique(sub2$ID),length.unique(sub3$ID),length.unique(sub4$ID),length.unique(sub5$ID))
plot(N_IDs ~ seq, type="b")
Using tidyverse, you can save some time by first calculating the max value for each ID,
library(tidyverse)
idmax <- df %>% group_by(ID) %>% summarize(max=max(values)) %>% pull(max)
Then for each cut point, return the count that pass
map_df(1:5, ~data.frame(cut=., count=sum(idmax >.)))
# cut count
# 1 1 4
# 2 2 3
# 3 3 3
# 4 4 3
# 5 5 1
Using non-equi joins:
library(data.table)
setDT(df)
df[.(seq = 1:5), on = .(values >= seq), allow = T, .(N_IDs = uniqueN(ID)), by = .EACHI]
# values N_IDs
#1: 1 4
#2: 2 3
#3: 3 3
#4: 4 3
#5: 5 1

Select groups based on number of unique / distinct values

I have a data frame like below
sample <- data.frame(ID = 1:9,
Group = c('AA','AA','AA','BB','BB','CC','CC','BB','CC'),
Value = c(1,1,1,2,2,2,3,2,3))
ID Group Value
1 AA 1
2 AA 1
3 AA 1
4 BB 2
5 BB 2
6 CC 2
7 CC 3
8 BB 2
9 CC 3
I want to select groups according to the number of distinct (unique) values within each group. For example, select groups where all values within the group are the same (one distinct value per group). If you look at the group CC, it has more than one distinct value (2 and 3) and should thus be removed. The other groups, with only one distinct value, should be kept. Desired output:
ID Group Value
1 AA 1
2 AA 1
3 AA 1
4 BB 2
5 BB 2
8 BB 2
Would you tell me simple and fast code in R that solves the problem?
Here's a solution using dplyr:
library(dplyr)
sample <- data.frame(
ID = 1:9,
Group= c('AA', 'AA', 'AA', 'BB', 'BB', 'CC', 'CC', 'BB', 'CC'),
Value = c(1, 1, 1, 2, 2, 2, 3, 2, 3)
)
sample %>%
group_by(Group) %>%
filter(n_distinct(Value) == 1)
We group the data by Group, and then only select groups where the number of distinct values of Value is 1.
data.table version:
library(data.table)
sample <- as.data.table(sample)
sample[ , if(uniqueN(Value) == 1) .SD, by = Group]
# Group ID Value
#1: AA 1 1
#2: AA 2 1
#3: AA 3 1
#4: BB 4 2
#5: BB 5 2
#6: BB 8 2
An alternative using ave if the data is numeric, is to check if the variance is 0:
sample[with(sample, ave(Value, Group, FUN=var ))==0,]
An alternative solution that could be faster on large data is:
setkey(sample, Group, Value)
ans <- sample[unique(sample)[, .N, by=Group][N==1, Group]]
The point is that calculating unique values for each group could be time consuming when there are more groups. Instead, we can set the key on the data.table, then take unique values by key (which is extremely fast) and then count the total values for each group. We then require only those where it is 1. We can then perform a join (which is once again very fast). Here's a benchmark on large data:
require(data.table)
set.seed(1L)
sample <- data.table(ID=1:1e7,
Group = sample(rep(paste0("id", 1:1e5), each=100)),
Value = sample(2, 1e7, replace=TRUE, prob=c(0.9, 0.1)))
system.time (
ans1 <- sample[,if(length(unique(Value))==1) .SD ,by=Group]
)
# minimum of three runs
# user system elapsed
# 14.328 0.066 14.382
system.time ({
setkey(sample, Group, Value)
ans2 <- sample[unique(sample)[, .N, by=Group][N==1, Group]]
})
# minimum of three runs
# user system elapsed
# 5.661 0.219 5.877
setkey(ans1, Group, ID)
setkey(ans2, Group, ID)
identical(ans1, ans2) # [1] TRUE
You can make a selector for sample using ave many different ways.
sample[ ave( sample$Value, sample$Group, FUN = function(x) length(unique(x)) ) == 1,]
or
sample[ ave( sample$Value, sample$Group, FUN = function(x) sum(x - x[1]) ) == 0,]
or
sample[ ave( sample$Value, sample$Group, FUN = function(x) diff(range(x)) ) == 0,]
Here's an approach
> ind <- aggregate(Value~Group, FUN=function(x) length(unique(x))==1, data=sample)[,2]
> sample[sample[,"Group"] %in% levels(sample[,"Group"])[ind], ]
ID Group Value
1 1 AA 1
2 2 AA 1
3 3 AA 1
4 4 BB 2
5 5 BB 2
8 8 BB 2

Resources