Lengthen an extremely large data frame with massive sparsity

I have a data frame such as this (but of size 16 billion):
structure(list(id1 = c(1, 2, 3, 4, 4, 4, 4, 4, 4, 4), id2 = c("a",
"b", "c", "d", "e", "f", "g", "h", "i", "j"), b1 = c(NA, NA,
NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L), b2 = c(1, NA, NA, NA, NA, NA,
1, 1, 1, 1), b3 = c(NA, 1, NA, NA, NA, NA, NA, NA, 1, 1), b4 = c(NA,
NA, 1, NA, NA, NA, NA, NA, 1, 1)), .Names = c("id1", "id2", "b1",
"b2", "b3", "b4"), row.names = c(NA, 10L), class = "data.frame")
df
id1 id2 b1 b2 b3 b4
1 1 a NA 1 NA NA
2 2 b NA NA 1 NA
3 3 c NA NA NA 1
4 4 d 1 NA NA NA
5 4 e 1 NA NA NA
6 4 f 1 NA NA NA
7 4 g 1 1 NA NA
8 4 h 1 1 NA NA
9 4 i 1 1 1 1
10 4 j 1 1 1 1
I need to get it into long format while keeping ONLY the values of 1. Of course, I tried gather from tidyr and also melt from data.table, to no avail, as their memory requirements are explosive. My original data had zeros and ones; I replaced the zeroes with NA and hoped the na.rm = TRUE option would help with the memory issue, but it does not.
With just the ones retained and lengthened, my data frame will fit easily in the memory I have.
Is there a better way to do this than the standard methods? Trading extra compute for a smaller memory footprint is acceptable.
My desired output is the equivalent of:
library(dplyr)
library(tidyr)
df %>% gather(b, value, -id1, -id2, na.rm = TRUE)
id1 id2 b value
1 4 d b1 1
2 4 e b1 1
3 4 f b1 1
4 4 g b1 1
5 4 h b1 1
6 4 i b1 1
7 4 j b1 1
8 1 a b2 1
9 4 g b2 1
10 4 h b2 1
11 4 i b2 1
12 4 j b2 1
13 2 b b3 1
14 4 i b3 1
15 4 j b3 1
16 3 c b4 1
17 4 i b4 1
18 4 j b4 1
# or
reshape2::melt(df, id=c("id1","id2"), na.rm=TRUE)
# or
library(data.table)
melt(setDT(df), id=c("id1","id2"), na.rm=TRUE)
Currently, the call to gather on my full data set gives me this error, which I believe is due to a memory issue:
Error in .Call("tidyr_melt_dataframe", PACKAGE = "tidyr", data, id_ind, :
negative length vectors are not allowed
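One possible low-memory workaround (a sketch, not a tested solution at this scale): melt one value column at a time, keep only the positions equal to 1, and bind the small pieces, so the full long intermediate is never allocated. It assumes the wide data is already loadable and that every column other than id1 and id2 is a value column.
library(data.table)
value_cols <- setdiff(names(df), c("id1", "id2"))   # assumed: all non-id columns are b* columns
long <- rbindlist(lapply(value_cols, function(col) {
  idx <- which(df[[col]] == 1)                      # which() drops NAs, so only the 1s survive
  data.table(id1 = df$id1[idx], id2 = df$id2[idx], b = col, value = 1)
}))
The peak extra allocation is roughly one index vector and one small table per column, rather than the full melted result.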

Related

Join similar observations within a data.frame with R

I want to combine several observations in a data.frame into a single row per group, using one constantly repeated variable as the reference.
Example:
id var1 var2 var3
a 1 na na
a na 2 na
a na na 3
b 1 na
b na 2 na
b na na na
c na na 3
c na 2 na
c 1 na na
Expected result:
id var1 var2 var3
a 1 2 3
b 1 2 na
c 1 2 3
A possible solution (replacing "na" by NA with na_if):
library(tidyverse)
df %>%
na_if("na") %>%
group_by(id) %>%
summarize(across(var1:var3, ~ sort(.x)[1]))
#> # A tibble: 3 × 4
#> id var1 var2 var3
#> <chr> <chr> <chr> <chr>
#> 1 a 1 2 3
#> 2 b 1 2 <NA>
#> 3 c 1 2 3
Assumptions:
- "na" above is really R's native NA (not the string "na");
- in b's first row, var2 should be NA instead of an empty string "";
- given the above, var1:var3 should probably be numeric;
- either a group never has more than one non-NA value per column, or you only care about the first non-NA and want the rest discarded.
library(dplyr)
dat %>%
group_by(id) %>%
summarize(across(everything(), ~ na.omit(.)[1]))
# # A tibble: 3 x 4
# id var1 var2 var3
# <chr> <int> <int> <int>
# 1 a 1 2 3
# 2 b 1 2 NA
# 3 c 1 2 3
Data
dat <- structure(list(id = c("a", "a", "a", "b", "b", "b", "c", "c", "c"), var1 = c(1L, NA, NA, 1L, NA, NA, NA, NA, 1L), var2 = c(NA, 2L, NA, NA, 2L, NA, NA, 2L, NA), var3 = c(NA, NA, 3L, NA, NA, NA, 3L, NA, NA)), class = "data.frame", row.names = c(NA, -9L))
Assuming that your data has real NA values, you can use the following base R option with the data from @r2evans (thanks!):
aggregate(.~id, dat, mean, na.rm = TRUE, na.action=NULL)
Output:
id var1 var2 var3
1 a 1 2 3
2 b 1 2 NaN
3 c 1 2 3
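Note that aggregate with mean returns NaN for groups where every value is NA (row b, var3 above). If a plain NA is preferred, one small follow-up (not part of the original answer) is to convert the NaN afterwards:
res <- aggregate(. ~ id, dat, mean, na.rm = TRUE, na.action = NULL)
res[-1] <- lapply(res[-1], function(x) replace(x, is.nan(x), NA))  # NaN from all-NA groups becomes NA
res
#   id var1 var2 var3
# 1  a    1    2    3
# 2  b    1    2   NA
# 3  c    1    2    3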

Replace numerical value in two columns with NA based on a single other column NA value in R

I have simplified my df to:
A <- c("a", "b", "c", "d", "e", "f", "g", "NA", "h", "I")
B <- c(NA, 2, 3, 4, NA, NA, 5, 6, 8, NA)
C <- c(NA, 9, 8, 4, 5, 7, 5, 6, NA, NA)
D <- c(NA, 1, NA, 3, NA, 5, NA, NA, 8, NA)
E <- c(1,2,3,4,5,6,7,8,9,10)
df <- data.frame(A, B, C, D, E)
I would like to write general code that sets the numerical values of columns B and C to NA wherever column D is NA.
The resulting df2 would be:
A <- c("a", "b", "c", "d", "e", "f", "g", "NA", "h", "I")
B <- c(NA, 2, NA, 4, NA, NA, NA, NA, 8, NA)
C <- c(NA, 9, NA, 4, NA, 7, NA, NA, NA, NA)
D <- c(NA, 1, NA, 3, NA, 5, NA, NA, 8, NA)
E <- c(1,2,3,4,5,6,7,8,9,10)
df2 <- data.frame(A, B, C, D, E)
For my code that isn't working, I have so far tried the following, which gives me the error "unused argument (as.numeric(B))":
df2 <- df %>% na_if(is.na(D), as.numeric(B)) %>%
na_if(is.na(D), as.numeric(C))
Any help will be greatly appreciated. I cannot install the naniar package, so please no solutions that use replace_with_na_at.
Thank you!
With dplyr, we can apply a simple ifelse statement to both B and C using across, replacing their values with NA when the condition is met (i.e., when D is NA).
library(dplyr)
output <- df %>%
mutate(across(B:C, ~ ifelse(is.na(D), NA, .x)))
Output
A B C D E
1 a NA NA NA 1
2 b 2 9 1 2
3 c NA NA NA 3
4 d 4 4 3 4
5 e NA NA NA 5
6 f NA 7 5 6
7 g NA NA NA 7
8 NA NA NA NA 8
9 h 8 NA 8 9
10 I NA NA NA 10
Test
identical(output, df2)
# [1] TRUE
data.table
A <- c("a", "b", "c", "d", "e", "f", "g", "NA", "h", "I")
B <- c(NA, 2, 3, 4, NA, NA, 5, 6, 8, NA)
C <- c(NA, 9, 8, 4, 5, 7, 5, 6, NA, NA)
D <- c(NA, 1, NA, 3, NA, 5, NA, NA, 8, NA)
E <- c(1,2,3,4,5,6,7,8,9,10)
df <- data.frame(A, B, C, D, E)
library(data.table)
cols <- c("B", "C")
setDT(df)[is.na(D), (cols) := NA][]
#> A B C D E
#> 1: a NA NA NA 1
#> 2: b 2 9 1 2
#> 3: c NA NA NA 3
#> 4: d 4 4 3 4
#> 5: e NA NA NA 5
#> 6: f NA 7 5 6
#> 7: g NA NA NA 7
#> 8: NA NA NA NA 8
#> 9: h 8 NA 8 9
#> 10: I NA NA NA 10
Created on 2022-03-02 by the reprex package (v2.0.1)
Base R
A base R solution with Map and is.na<-.
A <- c("a", "b", "c", "d", "e", "f", "g", "NA", "h", "I")
B <- c(NA, 2, 3, 4, NA, NA, 5, 6, 8, NA)
C <- c(NA, 9, 8, 4, 5, 7, 5, 6, NA, NA)
D <- c(NA, 1, NA, 3, NA, 5, NA, NA, 8, NA)
E <- c(1,2,3,4,5,6,7,8,9,10)
df <- data.frame(A, B, C, D, E)
df[c("B", "C")] <- Map(\(x, y) {
is.na(x) <- is.na(y)
x
}, df[c("B", "C")], df["D"])
df
#> A B C D E
#> 1 a NA NA NA 1
#> 2 b 2 9 1 2
#> 3 c NA NA NA 3
#> 4 d 4 4 3 4
#> 5 e NA NA NA 5
#> 6 f NA 7 5 6
#> 7 g NA NA NA 7
#> 8 NA NA NA NA 8
#> 9 h 8 NA 8 9
#> 10 I NA NA NA 10
Created on 2022-03-01 by the reprex package (v2.0.1)
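For readers unfamiliar with the is.na<- replacement form used above (and in the dplyr variant below), a tiny illustration: it marks the positions given on the right-hand side as NA.
x <- c(10, 20, 30)
is.na(x) <- c(TRUE, FALSE, TRUE)
x
#> [1] NA 20 NA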
dplyr
And a solution with dplyr, using the same is.na<- idiom.
library(dplyr)
df %>%
mutate(across(B:C, \(x) {is.na(x) <- is.na(D); x}))
#> A B C D E
#> 1 a NA NA NA 1
#> 2 b 2 9 1 2
#> 3 c NA NA NA 3
#> 4 d 4 4 3 4
#> 5 e NA NA NA 5
#> 6 f NA 7 5 6
#> 7 g NA NA NA 7
#> 8 NA NA NA NA 8
#> 9 h 8 NA 8 9
#> 10 I NA NA NA 10
Created on 2022-03-01 by the reprex package (v2.0.1)

How to replace missing data of questionnaire items with row means in R?

df <- data.frame(A1 = c(6, 8, NA, 1, 5),
A2 = c(NA, NA, 9, 3, 6),
A3 = c(9, NA, 1, NA, 4),
B1 = c(NA, NA, 9, 3, 6),
B2 = c(9, NA, 1, NA, 4),
B3 = c(NA, NA, 9, 3, 6)
)
I have a dataset with multiple questionnaires that each have multiple items. I would like to replace the missing data with the row mean of the observed values for each questionnaire (missing values in the A items replaced by the row mean of A1 to A3, and missing values in the B items replaced by the row mean of B1 to B3). What is the best way to do that?
You may try
df <- data.frame(A1 = c(6, 8, NA, 1, 5),
A2 = c(NA, NA, 9, 3, 6),
A3 = c(9, NA, 1, NA, 4),
B1 = c(NA, NA, 9, 3, 6),
B2 = c(9, NA, 1, NA, 4),
B3 = c(NA, NA, 9, 3, 6)
)
library(dplyr)
df1 <- df %>%
select(starts_with("A"))
df2 <- df %>%
select(starts_with("B"))
x1 <- which(is.na(df1), arr.ind = TRUE)
df1[x1] <- rowMeans(df1, na.rm = T)[x1[,1]]
x2 <- which(is.na(df2), arr.ind = TRUE)
df2[x2] <- rowMeans(df2, na.rm = T)[x2[,1]]
df <- cbind(df1, df2)
df
A1 A2 A3 B1 B2 B3
1 6 7.5 9 9 9 9
2 8 8.0 8 NaN NaN NaN
3 5 9.0 1 9 1 9
4 1 3.0 2 3 3 3
5 5 6.0 4 6 4 6
You may use split.default to split the data into groups by column prefix and replace NA with the row-wise mean (taken from this answer: https://stackoverflow.com/a/6918323/3962914):
as.data.frame(lapply(split.default(df, sub('\\d+', '', names(df))), function(x) {
k <- which(is.na(x), arr.ind = TRUE)
x[k] <- rowMeans(x, na.rm = TRUE)[k[, 1]]
x
})) -> result
names(result) <- names(df)
result
# A1 A2 A3 B1 B2 B3
#1 6 7.5 9 9 9 9
#2 8 8.0 8 NaN NaN NaN
#3 5 9.0 1 9 1 9
#4 1 3.0 2 3 3 3
#5 5 6.0 4 6 4 6
You could also do:
library(dplyr)
library(tidyr)
df %>%
reshape(names(.), dir='long', sep="")%>%
group_by(id) %>%
mutate(across(A:B, ~replace(.x, is.na(.x), mean(.x, na.rm = TRUE))))%>%
pivot_wider(id, names_from = time, values_from = A:B, names_sep = "") %>%
ungroup() %>%
select(-id)
# A tibble: 5 x 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
We can use split.default with na.aggregate
library(purrr)
library(zoo)
library(dplyr)
library(stringr)
map_dfc(split.default(df, str_remove(names(df), "\\d+")), ~
as_tibble(t(na.aggregate(t(.x)))))
# A tibble: 5 × 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
Span a matrix of rowMeans across the rows and replace the NAs, inside an lapply that greps for each questionnaire prefix.
do.call(cbind, lapply(c('A', 'B'), function(q) {
s <- df[, grep(q, names(df))]
na <- is.na(s)
replace(s, na, rowMeans(s, na.rm=TRUE)[row(s)][na])
}))
# A1 A2 A3 B1 B2 B3
# 1 6 7.5 9 9 9 9
# 2 8 8.0 8 NaN NaN NaN
# 3 5 9.0 1 9 1 9
# 4 1 3.0 2 3 3 3
# 5 5 6.0 4 6 4 6
Data:
df <- structure(list(A1 = c(6, 8, NA, 1, 5), A2 = c(NA, NA, 9, 3, 6
), A3 = c(9, NA, 1, NA, 4), B1 = c(NA, NA, 9, 3, 6), B2 = c(9,
NA, 1, NA, 4), B3 = c(NA, NA, 9, 3, 6)), class = "data.frame", row.names = c(NA,
-5L))
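The key step in the answer above is rowMeans(s, na.rm = TRUE)[row(s)], which recycles each row's mean across that whole row so the NA positions can be filled in place; a tiny illustration (not from the original answer):
m <- matrix(1:6, nrow = 2)     # a 2 x 3 matrix
rowMeans(m)                    # per-row means: 3 and 4
#> [1] 3 4
rowMeans(m)[row(m)]            # each mean repeated across its row (column-major order)
#> [1] 3 4 3 4 3 4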

Create multiple sequences dependent on data frame column

Starting with data where the start of each desired sequence is marked with a 1 in col2, I need to fill in the NA rows with continuing sequences: the desired third column (col3) restarts at 1 at every non-NA value of col2 and counts up through the following NA rows.
I can make this happen with the loop below, but what is the better R programming way to do it?
for(i in 1:length(df2$col2)) {
df2$col3[i] <- ifelse(df2$col2[i] == 1, 1, df2$col3[i - 1] + 1)
if(is.na(df2$col2[i])) df2$col3[i] <- df2$col3[i - 1] + 1
}
Here is a 20-row data set of the first two columns:
structure(list(col1 = c(478.69, 320.45, 503.7, 609.3, 478.19,
478.69, 320.45, 503.7, 609.3, 478.19, 419.633683050051, 552.939975773916,
785.119385505095, 18.2542654918507, 98.6469651805237, 132.587260054424,
697.119552921504, 512.560374778695, 916.425200179219, 14.3385051051155
), col2 = c(1, NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, 1, NA,
NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
Try:
library(data.table)
df2 <- data.table(df2)
df2[, col3 := col2[1] + 1 * (1:.N - 1), by = .(cumsum(!is.na(col2)))]
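The grouping key here, cumsum(!is.na(col2)), increments at every non-NA row, so each 1 and the run of NAs following it fall into the same group; the ave() answer below uses the same trick. A minimal illustration with the 20-row data above:
cumsum(!is.na(df2$col2))
#> [1] 1 1 2 2 2 3 3 4 4 4 4 4 5 5 5 5 5 5 5 5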
You can use ave with seq_along, grouping by cumsum(!is.na(df2$col2)).
df2$col3 <- ave(integer(nrow(df2)), cumsum(!is.na(df2$col2)), FUN=seq_along)
df2
# col1 col2 col3
#1 478.69000 1 1
#2 320.45000 NA 2
#3 503.70000 1 1
#4 609.30000 NA 2
#5 478.19000 NA 3
#6 478.69000 1 1
#7 320.45000 NA 2
#8 503.70000 1 1
#9 609.30000 NA 2
#10 478.19000 NA 3
#11 419.63368 NA 4
#12 552.93998 NA 5
#13 785.11939 1 1
#14 18.25427 NA 2
#15 98.64697 NA 3
#16 132.58726 NA 4
#17 697.11955 NA 5
#18 512.56037 NA 6
#19 916.42520 NA 7
#20 14.33851 NA 8

Filtering a data frame in R based on how many elements in a row are filled out

I have the following data frame (dput at end):
> d
a b d
1 1 NA NA
2 NA NA NA
3 2 2 2
4 3 3 NA
I want to keep only the rows that have at least two items that are not NA. I wish to get the following result; how do I do that?
> d
a b d
3 2 2 2
4 3 3 NA
> dput(d)
structure(list(a = c(1, NA, 2, 3), b = c(NA, NA, 2, 3), d = c(NA,
NA, 2, NA)), .Names = c("a", "b", "d"), row.names = c(NA, -4L
), class = "data.frame")
We can get the rowSums of the logical matrix is.na(d) and use that to create a logical vector (rowSums(...) < 2) for subsetting the rows.
d[rowSums(is.na(d))<2,]
# a b d
#3 2 2 2
#4 3 3 NA
Or, as @DavidArenburg mentioned, it can also be done with Reduce:
d[Reduce(`+`, lapply(d, is.na)) < 2, ]
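Equivalently (a small variation, not from the original answers), the condition can be written directly as "at least two non-NA values per row":
d[rowSums(!is.na(d)) >= 2, ]
#  a b  d
#3 2 2  2
#4 3 3 NA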
