Drop columns with many missing values in R

I am trying to drop columns that have fewer than 5 valid (non-missing) values. Here is an example dataset.
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
                 i1 = c(0,1,1,1,1,0,0,1,NA,1),
                 i2 = c(1,0,0,1,0,1,1,0,0,NA),
                 i3 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,0),
                 i4 = c(NA,1,NA,NA,NA,NA,NA,NA,1,NA))
> df
id i1 i2 i3 i4
1 1 0 1 NA NA
2 2 1 0 NA 1
3 3 1 0 NA NA
4 4 1 1 NA NA
5 5 1 0 NA NA
6 6 0 1 NA NA
7 7 0 1 NA NA
8 8 1 0 NA NA
9 9 NA 0 NA 1
10 10 1 NA 0 NA
In this case, columns i3 and i4 need to be dropped from the data frame.
How can I get the desired dataset below?
> df
id i1 i2
1 1 0 1
2 2 1 0
3 3 1 0
4 4 1 1
5 5 1 0
6 6 0 1
7 7 0 1
8 8 1 0
9 9 NA 0
10 10 1 NA

You can keep columns with at least 5 non-missing values with:
df[colSums(!is.na(df)) >= 5]
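Here !is.na(df) is a logical matrix and colSums() adds up each column's TRUE values, i.e. its count of non-missing entries. Printing the intermediate shows why i3 and i4 are dropped:
colSums(!is.na(df))
# id i1 i2 i3 i4
# 10  9  9  1  2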

You can use discard from the purrr package:
library(purrr)
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10),
                 i1 = c(0,1,1,1,1,0,0,1,NA,1),
                 i2 = c(1,0,0,1,0,1,1,0,0,NA),
                 i3 = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,0),
                 i4 = c(NA,1,NA,NA,NA,NA,NA,NA,1,NA))
df %>%
discard(~ sum(!is.na(.))<5)
#> id i1 i2
#> 1 1 0 1
#> 2 2 1 0
#> 3 3 1 0
#> 4 4 1 1
#> 5 5 1 0
#> 6 6 0 1
#> 7 7 0 1
#> 8 8 1 0
#> 9 9 NA 0
#> 10 10 1 NA
Created on 2022-11-10 with reprex v2.0.2
While this is likely slower than base R methods for datasets with very many columns (say, more than 1000), I find the readability of the code far superior. In addition, it is easy to express more complicated conditions; see the sketch below.
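For example, a rule based on the share of missing values rather than a fixed count is a one-line change (a sketch; the 0.5 cutoff is arbitrary):
df %>%
  discard(~ mean(is.na(.)) > 0.5)  # drop columns that are more than half NA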

Using base R, another approach, counting the non-missing values directly:
> df[, sapply(df, function(x) sum(!is.na(x))) >= 5]
id i1 i2
1 1 0 1
2 2 1 0
3 3 1 0
4 4 1 1
5 5 1 0
6 6 0 1
7 7 0 1
8 8 1 0
9 9 NA 0
10 10 1 NA

A performance comparison of the different answers given in this post:
library(dplyr)
library(purrr)
funs = list(
  colSums = function(df){df[colSums(!is.na(df)) >= nrow/10]},
  sapply = function(df){df[, sapply(df, function(x) sum(!is.na(x))) >= nrow/10]},
  discard = function(df){df %>% discard(~ sum(!is.na(.)) < nrow/10)},
  mutate = function(df){df %>% mutate(across(where(~ sum(!is.na(.)) < nrow/10), ~ NULL))},
  select = function(df){df %>% select(where(~ sum(!is.na(.)) >= nrow/10))})
ncol = 10000
nrow = 100
df = replicate(ncol, sample(c(1:9, NA), nrow, TRUE)) %>%
  as_tibble(.name_repair = "unique")
avrtime = map_dbl(funs, function(f){
  duration = c()
  for(i in 1:10){
    t1 = Sys.time()
    f(df)
    t2 = Sys.time()
    duration[i] = as.numeric(t2 - t1)
  }
  return(mean(duration))
})
avrtime[order(avrtime)]
The average time taken by each (in seconds):
colSums sapply discard select mutate
0.04510500 0.04692972 0.29207475 0.29451160 0.31755514
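Timing with Sys.time() in a loop is fairly noisy for sub-second operations; as a sketch, the same comparison could be run with the microbenchmark package (reusing the funs list and df from above):
library(microbenchmark)
microbenchmark(
  colSums = funs$colSums(df),
  sapply  = funs$sapply(df),
  discard = funs$discard(df),
  times = 10
)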

Using select
library(dplyr)
df %>%
select(where(~ sum(complete.cases(.x)) >=5))
Output:
id i1 i2
1 1 0 1
2 2 1 0
3 3 1 0
4 4 1 1
5 5 1 0
6 6 0 1
7 7 0 1
8 8 1 0
9 9 NA 0
10 10 1 NA
Or in base R (the \(x) lambda shorthand requires R >= 4.1):
Filter(\(x) sum(complete.cases(x)) >= 5, df)

Related

How to count the number of response values by time thresholds in R

I have a student dataset that includes responses to questions scored as right or wrong. There is also a time variable in seconds. I would like to create time flags recording the number of correct and incorrect responses by the 1-minute, 2-minute, and 3-minute thresholds. Here is a sample dataset.
df <- data.frame(id = c(1,2,3,4,5),
                 gender = c("m","f","m","f","m"),
                 age = c(11,12,12,13,14),
                 i1 = c(1,0,NA,1,0),
                 i2 = c(0,1,0,"1]",1),
                 i3 = c("1]",1,"1]",0,"0]"),
                 i4 = c(0,"0]",1,1,0),
                 i5 = c(1,1,NA,"0]","1]"),
                 i6 = c(0,0,"0]",1,1),
                 i7 = c(1,"1]",1,0,0),
                 i8 = c(0,0,0,"1]","1]"),
                 i9 = c(1,1,1,0,NA),
                 time = c(115,138,148,195,225))
> df
id gender age i1 i2 i3 i4 i5 i6 i7 i8 i9 time
1 1 m 11 1 0 1] 0 1 0 1 0 1 115
2 2 f 12 0 1 1 0] 1 0 1] 0 1 138
3 3 m 12 NA 0 1] 1 <NA> 0] 1 0 1 148
4 4 f 13 1 1] 0 1 0] 1 0 1] 0 195
5 5 m 14 0 1 0] 0 1] 1 0 1] NA 225
The minute thresholds are represented by a ] sign at the right side of the score.
For example, for id = 3, the 1-minute threshold is at item i3 and the 2-minute threshold is at item i6. Each student might have different time thresholds.
I need to create flagging variables that count the number of correct and incorrect responses by the 1-minute, 2-minute, and 3-minute thresholds.
How can I achieve the desired dataset below?
> df1
id gender age i1 i2 i3 i4 i5 i6 i7 i8 i9 time one_true one_false two_true two_false three_true three_false
1 1 m 11 1 0 1] 0 1 0 1 0 1 115 2 1 NA NA NA NA
2 2 f 12 0 1 1 0] 1 0 1] 0 1 138 2 2 4 3 NA NA
3 3 m 12 NA 0 1] 1 <NA> 0] 1 0 1 148 1 1 2 2 NA NA
4 4 f 13 1 1] 0 1 0] 1 0 1] 0 195 2 0 3 2 5 3
5 5 m 14 0 1 0] 0 1] 1 0 1] NA 225 1 2 2 3 4 4
library(tidyverse)
df %>%
  pivot_longer(i1:i9, values_transform = as.character) %>%
  group_by(id) %>%
  # number the threshold blocks, counting ']' markers from the end so that
  # items after a player's last threshold are dropped by the filter
  mutate(vs = rev(cumsum(replace_na(str_detect(rev(value), ']'), 0)))) %>%
  filter(vs > 0) %>%
  mutate(vs = max(vs) - vs + 1) %>%
  group_by(vs, .add = TRUE) %>%
  summarise(true = sum(str_detect(value, '1'), na.rm = TRUE),
            false = sum(str_detect(value, '0'), na.rm = TRUE),
            .groups = "drop_last") %>%
  # cumulative counts up to each threshold
  mutate(across(c(true, false), cumsum)) %>%
  pivot_wider(id_cols = id, names_from = vs, values_from = c(true, false))
# A tibble: 5 x 7
# Groups: id [5]
id true_1 true_2 true_3 false_1 false_2 false_3
<dbl> <int> <int> <int> <int> <int> <int>
1 1 2 NA NA 1 NA NA
2 2 2 4 NA 2 3 NA
3 3 1 2 NA 1 2 NA
4 4 2 3 5 0 2 3
5 5 1 2 4 2 3 4
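The desired df1 in the question also keeps the original columns and uses names like one_true instead of true_1. Assuming the pipeline result above is stored as res (a hypothetical name), a sketch of that final step, still using the tidyverse loaded above:
# map threshold numbers to words, e.g. true_1 -> one_true
nm <- c("1" = "one", "2" = "two", "3" = "three")
res <- rename_with(res, ~ paste0(nm[sub(".*_", "", .x)], "_", sub("_.*", "", .x)), -id)
df1 <- left_join(df, res, by = "id")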
You could also accomplish the same in base R:
fun <- function(x){
  # run lengths between successive ']' threshold markers
  a <- diff(c(0, which(grepl("]", x))))
  f_sum <- function(x, y) sum(na.omit(grepl(x, y)))
  fn <- function(x) c(true = f_sum('1', x), false = f_sum('0', x))
  # count 1s and 0s within each threshold block, then accumulate across blocks
  y <- tapply(x[seq(sum(a))], rep(seq_along(a), a), fn)
  s <- do.call(rbind, Reduce("+", y, accumulate = TRUE))
  nms <- do.call(paste, c(sep = '_', expand.grid(colnames(s), seq(nrow(s)))))
  setNames(c(t(s)), nms)
}
fun2 <- function(x){
  # pad each student's counts to a common length and bind into a matrix
  ln <- lengths(x)
  nms <- names(x[[which.max(ln)]])
  do.call(rbind, lapply(x, function(x) setNames(`length<-`(x, max(ln)), nms)))
}
fun2(apply(df[4:12],1,fun))
true_1 false_1 true_2 false_2 true_3 false_3
[1,] 2 1 NA NA NA NA
[2,] 2 2 4 3 NA NA
[3,] 1 1 2 2 NA NA
[4,] 2 0 3 2 5 3
[5,] 1 2 2 3 4 4
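To attach these flags back onto the original data, matching the shape of the desired df1 (though with names like true_1 rather than one_true), the matrix can simply be column-bound, as a final sketch:
df1 <- cbind(df, fun2(apply(df[4:12], 1, fun)))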

How to select and remove rows based on position for a specific range in R

Suppose I have two data frames like this:
df1 <- data.frame(a = c(1,2,4,0,0),
                  b = c(0,3,5,5,0),
                  c = c(0,0,6,7,6))
df2 <- data.frame(a = c(3,6,8,0,0),
                  b = c(0,9,10,4,0),
                  c = c(0,0,1,4,9))
And then I join them with dplyr's full_join, like
df3 <- full_join(df1, df2)
print(df3)
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 0 5 7
5 0 0 6
6 3 0 0
7 6 9 0
8 8 10 1
9 0 4 4
10 0 0 9
Note that I always have the same pattern, with zeros in rows 1 and 2 and in rows 9 and 10. And I also have zeros between rows 4 and 7.
I want to remove, only, the zeros between rows 4 and 7.
So, I can solve it, like:
df3[4,1] <- NA
df3[5,1] <- NA
df3[5,2] <- NA
df3[6,2] <- NA
df3[6,3] <- NA
df3[7,3] <- NA
new.df3 <- as.data.frame(lapply(df3, na.omit))
print(new.df3)
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 3 5 7
5 6 9 6
6 8 10 1
7 0 4 4
8 0 0 9
But this is not elegant and is very time-consuming.
Any thoughts? I really appreciate it, thanks in advance.
Best!
df3 %>%
  mutate(rn = between(row_number(), 4, 7)) %>%
  summarise(across(-rn, ~ .x[!(.x == 0 & rn)]))
a b c
1 1 0 0
2 2 3 0
3 4 5 6
4 3 5 7
5 6 9 6
6 8 10 1
7 0 4 4
8 0 0 9
First, find which values are zero between rows 4 and 7.
to_remove <- apply(df3[4:7, ], 1, function(x) which(x == 0))
Then, replace them with NA.
for(i in seq_along(to_remove)){
  df3[as.numeric(names(to_remove))[i], to_remove[[i]]] <- NA
}
And, finally, drop them.
new.df3 <- as.data.frame(lapply(df3, na.omit))
print(new.df3)
Here's a different approach:
mask <- !(seq(nrow(df3)) %in% 4:7 & df3 == 0)
df.lst <- lapply(1:3, function(x) df3[mask[, x], x])
sapply(df.lst, length)
# [1] 8 8 8 # Check to make sure the columns are the same length
names(df.lst) <- colnames(df3)
(new.df3 <- as.data.frame(df.lst))
# a b c
# 1 1 0 0
# 2 2 3 0
# 3 4 5 6
# 4 3 5 7
# 5 6 9 6
# 6 8 10 1
# 7 0 4 4
# 8 0 0 9
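The same masking idea can be condensed with Map, as a sketch: for each column, drop the values that are zero and sit in rows 4 to 7. It works because each column loses the same number of values, so the result is rectangular again.
idx <- 4:7
new.df3 <- as.data.frame(Map(function(col) col[!(seq_along(col) %in% idx & col == 0)], df3))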

Assign observation level values by grouping variable

Thanks in advance for any help.
I have the below data frame
df <- data.frame(
  id = c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5),
  time = c(1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6),
  mortality = c(NA,1,0,0,0,0,NA,0,0,0,0,1,NA,0,0,0,0,0,NA,0,0,1,0,0,NA,0,1,0,0,0)
)
> head(df)
id time mortality
1 1 1 NA
2 1 2 1
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
df$id represents individuals measured at six points in time throughout a survival trial. At the start of the trial all individuals are alive, and they subsequently die or remain alive. df$mortality indicates the time period in which the individual died; for example, individual 1 died in time period 2.
I would like to create a new variable indicating what I have called cumulative survival. This would indicate if the individual had died in the current time period or any of the previous time periods. How would I code this?
I have tried a number of different ways using ifelse() statements and dplyr group_by() without success.
Below is what the new data frame should look like. Thanks
> df
id time mortality cum.survival
1 1 1 NA 0
2 1 2 1 1
3 1 3 0 1
4 1 4 0 1
5 1 5 0 1
6 1 6 0 1
7 2 1 NA 0
8 2 2 0 0
9 2 3 0 0
10 2 4 0 0
11 2 5 0 0
12 2 6 1 1
13 3 1 NA 0
14 3 2 0 0
15 3 3 0 0
16 3 4 0 0
17 3 5 0 0
18 3 6 0 0
19 4 1 NA 0
20 4 2 0 0
21 4 3 0 0
22 4 4 1 1
23 4 5 0 1
24 4 6 0 1
25 5 1 NA 0
26 5 2 0 0
27 5 3 1 1
28 5 4 0 1
29 5 5 0 1
30 5 6 0 1
Assuming a person can die only once, we can also use cumsum.
First, replace the NAs in mortality with 0 when creating cum.survival.
df <- transform(df, cum.survival = replace(mortality, is.na(mortality), 0))
We can then use base R :
df$cum.survival <- with(df, ave(cum.survival, id, FUN = cumsum))
dplyr :
library(dplyr)
df %>% group_by(id) %>% mutate(cum.survival = cumsum(cum.survival))
Or data.table :
library(data.table)
setDT(df)[, cum.survival := cumsum(cum.survival), id]
Another option is to compare the row index within each group to the index of the first 1. Note that both variants below assume every individual eventually has a 1 in mortality; for individuals who never die (such as id 3 here), the cumsum or cummax approaches are safer.
We can use which.max:
df %>%
  group_by(id) %>%
  mutate(cum.survival = +(row_number() >= which.max(mortality)))
Or match:
df %>%
  group_by(id) %>%
  mutate(cum.survival = +(row_number() >= match(1, mortality)))
An option using by:
df$cum.survival <- unlist(by(df$mortality, df$id, function(x) cummax(replace(x, is.na(x), 0L))))
or ave:
df$cum.survival <- ave(df$mortality, df$id, FUN=function(x) cummax(replace(x, is.na(x), 0L)))
or tapply:
df$cum.survival <- unlist(tapply(df$mortality, df$id, FUN=function(x) cummax(replace(x, is.na(x), 0L))))
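As a dplyr sketch of the same cummax idea: coalesce replaces the leading NA with 0, and cummax latches at 1 from the period of death onward, which also stays correct if an individual never dies.
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(cum.survival = cummax(coalesce(mortality, 0))) %>%
  ungroup()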

Finding values in consecutive rows

An example of the dataframe I have is given below.
ID X
1 1
2 2
3 1
4 0
5 0
6 1
7 4
8 5
9 6
10 7
11 0
12 0
I want to apply logic that checks whether 3 or more consecutive rows have a value > 0. If they do, I want to flag them in another column. Hence the output will look as follows:
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 1
7 4 1
8 5 1
9 6 1
10 7 1
11 0 0
12 0 0
EXTENSION:
How would I get the following output, giving a different Y value for each group?
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 2
7 4 2
8 5 2
9 6 2
10 7 2
11 0 0
12 0 0
One option with base R: use rle to find runs of adjacent values in 'X' that are greater than 0, then replicate the run-level flag back out using the run lengths.
df1$Y <- with(rle(df1$X > 0), as.integer(rep(values & lengths > 2, lengths)))
df1$Y
#[1] 1 1 1 0 0 1 1 1 1 1 0 0
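To see what rle produces here, the logical vector compresses into four runs, and only the TRUE runs with length greater than 2 become 1s:
rle(df1$X > 0)
# Run Length Encoding
#   lengths: int [1:4] 3 2 5 2
#   values : logi [1:4] TRUE FALSE TRUE FALSE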
For the updated case in the OP's post
df1$Y <- inverse.rle(within.list(rle(df1$X > 0), {
  i1 <- values & (lengths > 2)
  values[i1] <- seq_along(values[i1])
}))
df1$Y
#[1] 1 1 1 0 0 2 2 2 2 2 0 0
Or using rleid from data.table
library(data.table)
setDT(df1)[, Y := as.integer((.N > 2) * (X > 0)),rleid(X > 0)]
data
df1 <- structure(list(ID = 1:12, X = c(1L, 2L, 1L, 0L, 0L, 1L, 4L, 5L,
6L, 7L, 0L, 0L)), class = "data.frame", row.names = c(NA, -12L
))
We can use rleid from data.table to create run groups, get the length of each group with ave, and assign 1 to runs of positive values whose length is greater than or equal to 3.
library(data.table)
df$Y <- as.integer(df$X > 0 & ave(df$X, rleid(df$X > 0), FUN = length) >= 3)
df
# ID X Y
#1 1 1 1
#2 2 2 1
#3 3 1 1
#4 4 0 0
#5 5 0 0
#6 6 1 1
#7 7 4 1
#8 8 5 1
#9 9 6 1
#10 10 7 1
#11 11 0 0
#12 12 0 0
EDIT
For the updated post, we can combine data.table's rleid with dplyr:
library(dplyr)
library(data.table)
df %>%
  group_by(group = rleid(X > 0)) %>%
  mutate(Y = ifelse(X > 0 & n() >= 3 & row_number() == 1, 1, 0)) %>%
  ungroup() %>%
  mutate(Y = cumsum(Y) * Y) %>%
  group_by(group) %>%
  mutate(Y = first(Y)) %>%
  ungroup() %>%
  select(-group)
# ID X Y
# <int> <int> <dbl>
# 1 1 1 1
# 2 2 2 1
# 3 3 1 1
# 4 4 0 0
# 5 5 0 0
# 6 6 1 2
# 7 7 4 2
# 8 8 5 2
# 9 9 6 2
#10 10 7 2
#11 11 0 0
#12 12 0 0
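A more direct data.table sketch for the numbered extension: flag qualifying runs, then relabel just those rows with rleid so the group numbers stay consecutive (grp and qual are hypothetical helper columns, and df1 is the data from above).
library(data.table)
dt <- as.data.table(df1)
dt[, grp := rleid(X > 0)]                 # one id per run of positives/zeros
dt[, qual := X > 0 & .N >= 3, by = grp]   # runs of positives with length >= 3
dt[, Y := 0L]
dt[qual == TRUE, Y := rleid(grp)]         # renumber qualifying runs 1, 2, ...
dt[, qual := NULL]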

New variable that indicates the first occurrence of a specific value

I want to create a new variable that indicates the first occurrence of a specific value in a variable.
In the following example dataset, I want a new variable "firstna" that is "1" for each player's first NA observation.
game_data <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data
player level points
1 1 1 20
2 1 2 NA
3 1 3 NA
4 1 4 NA
5 2 1 20
6 2 2 40
7 2 3 NA
8 2 4 NA
The resulting dataframe should look like this:
game_data_new <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA), firstna = c(0,1,0,0,0,0,1,0))
game_data_new
player level points firstna
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
To be honest, I don't know how to do this. It would be perfect if there were a dplyr option for this.
A base R solution:
ave(game_data$points, game_data$player,
FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0))
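Worked on a single player's vector: match() returns the position of the first NA (or 0 when there is none, thanks to nomatch = 0), and comparing against seq_along() turns that position into a 0/1 flag:
x <- c(20, NA, NA, NA)
match(NA, x, nomatch = 0)
# [1] 2
as.numeric(seq_along(x) == match(NA, x, nomatch = 0))
# [1] 0 1 0 0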
Another ave option to find the first NA by group (player); the is.na(x) term guards against flagging non-NA rows that might fall between the first and second NA.
game_data$firstna <- ave(game_data$points, game_data$player,
                         FUN = function(x) cumsum(is.na(x)) == 1 & is.na(x))
game_data
# player level points firstna
#1 1 1 20 0
#2 1 2 NA 1
#3 1 3 NA 0
#4 1 4 NA 0
#5 2 1 20 0
#6 2 2 40 0
#7 2 3 NA 1
#8 2 4 NA 0
Here is a solution with data.table:
library("data.table")
game_data <- data.table(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# player level points firstna
# 1: 1 1 20 FALSE
# 2: 1 2 NA TRUE
# 3: 1 3 NA FALSE
# 4: 1 4 NA FALSE
# 5: 2 1 20 FALSE
# 6: 2 2 40 FALSE
# 7: 2 3 NA TRUE
# 8: 2 4 NA FALSE
You can do this by grouping by player and then mutating to check whether a row has an NA value while the previous row doesn't:
game_data %>%
  group_by(player) %>%
  mutate(firstna = ifelse(is.na(points) & lag(!is.na(points)), 1, 0)) %>%
  ungroup()
Result:
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
library(tidyverse)
library(data.table)
data.frame(
  player = c(1,1,1,1,2,2,2,2),
  level = c(1,2,3,4,1,2,3,4),
  points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data
game_data_base1 <- game_data
game_data_dt <- data.table(game_data)
microbenchmark::microbenchmark(
  better_base = game_data$first_na <- ave(
    game_data$points,
    game_data$player,
    FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0)
  ),
  brute_base = do.call(
    rbind.data.frame,
    lapply(
      split(game_data, game_data$player),
      function(x) {
        x$firstna <- 0
        na_loc <- which(is.na(x$points))
        if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
        x
      }
    )
  ),
  tidy = game_data %>%
    group_by(player) %>%
    mutate(firstna = as.numeric(is.na(points) & !duplicated(points))) %>%
    ungroup(),
  dt = game_data_dt[, firstna := as.integer(is.na(points) & !is.na(shift(points))), player]
)
## Unit: microseconds
## expr min lq mean median uq max neval
## better_base 125.188 156.861 362.9829 191.6385 355.6675 3095.958 100
## brute_base 366.642 450.002 2782.6621 658.0380 1072.6475 174373.974 100
## tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350 42406.778 100
## dt 330.428 421.211 1031.9978 535.8415 1042.1240 9671.991 100
game_data %>%
  group_by(player) %>%
  mutate(firstna = as.numeric(is.na(points) & !duplicated(points)))
Group by player, then flag cases that are NA and not duplicated within the group: duplicated() treats every NA after the group's first NA as a duplicate, so only the first NA is flagged.
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
If you want the 1s on the last non-NA line before an NA, replace the mutate line with this:
mutate(lastnonNA=as.numeric(!is.na(points) & is.na(lead(points))))
First row of a block of NAs that runs all the way to the end of the player's group:
game_data %>%
  group_by(player) %>%
  mutate(firstna = as.numeric(is.na(points) & !duplicated(cbind(points, cumsum(!is.na(points))))))
Another way using base:
game_data$firstna <-
  unlist(
    tapply(game_data$points, game_data$player, function(x) {
      i <- which(is.na(x))[1]   # position of the first NA
      x[] <- 0
      x[i] <- 1
      x
    })
  )
or as another ?ave clone:
ave(game_data$points, game_data$player, FUN = function(x) {
  i <- which(is.na(x))[1]
  x[] <- 0
  x[i] <- 1
  x
})
An option using diff:
transform(game_data, firstna = ave(is.na(points), player, FUN = function(x) c(0,diff(x))))
# player level points firstna
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
And its dplyr equivalent:
library(dplyr)
game_data %>% group_by(player) %>% mutate(firstna = c(0,diff(is.na(points))))
# # A tibble: 8 x 4
# # Groups: player [2]
# player level points firstna
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
