Related
I want to perform bootstrap on this data set. Notice that the data has two factors: replicate and level, and two variables high.density and low.density that need to be regressed. I want to perform a bootstrap on this data-set but the replacements can occur only within the nested factor of replicate and level.
replicate level high.density low.density
1 low 14 36
1 low 54 31
1 mid 82 10
1 mid 24 NA
2 low 12 28
2 low 11 45
2 mid 12 17
2 mid NA 24
2 up 40 10
2 up NA 5
2 up 20 2
For instance, in replicate/ level: 1/low the low.density 31 and 36 can be interchanged (or high.density interchanged) so the head of that dataset may look like:
replicate level high.density low.density
1 low 14 31
1 low 54 36
1 mid 82 10
1 mid 24 NA
I then want to estimate the linear regression (glm) from this dataset. I would appreciate any feedback on trying to achieve this.
##DATA FRAME (credits: caldwellst)
df <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2), level = c("low", "low", "mid", "mid", "low", "low", "mid", "mid", "up", "up", "up"), high.density = c(14, 54, 82, 24, 12, 11, 12, NA, 40, NA, 20), low.density = c(36, 31, 10,
NA, 28, 45, 17, 24, 10, 5, 2)), class = c("spec_tbl_df","tbl_df","tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(cols = list(replicate = structure(list(), class = c("collector_double", "collector")), level = structure(list(), class = c("collector_character","collector")), high.density = structure(list(), class = c("collector_double","collector")), low.density = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
df$replicate <- as.factor(as.numeric(df$replicate))
df$level <- as.factor(as.character(df$level)
)
We may exploit split and do the sampling according to unique combinations of replicate and level. We could repeat this process B times.
df_shuffle <- function(DF) {
my_split <- split(DF, f = ~ DF$replicate + DF$level)
shuffle <- lapply(my_split, function(x) {
nrX <- nrow(x)
cbind(x[, c('replicate', 'level')],
high.density = x[sample(seq_len(nrX), replace = TRUE), 'high.density'],
low.density = x[sample(seq_len(nrX), replace = TRUE), 'low.density'])
})
DF_new <- do.call(rbind, shuffle)
rownames(DF_new) <- NULL
return(DF_new)
}
B <- 1000L
df_list <- replicate(B, df_shuffle(df), simplify = FALSE)
# ---------------------------------------------------
> df_list[[B]]
replicate level high.density low.density
1 1 low 54 36
2 1 low 54 36
3 2 low 12 45
4 2 low 12 28
5 1 mid 24 10
6 1 mid 82 10
7 2 mid NA 17
8 2 mid 12 17
9 2 up 20 10
10 2 up 40 10
11 2 up 20 5
Because the original data contains missing observations, we either have to multiply impute them or opt to lisewise delete them. For now, let's perform the latter option.
# listwise delete missing observations
df_list <- lapply(df_list, function(x) x[complete.cases(x), ])
Finally, we perform a linear regression on each shuffled dataset and store the B coefficients in out.
row_bind <- function(x) data.frame(do.call(rbind, x))
out <- row_bind(
lapply(df_list, function(x) lm(high.density ~ low.density, data = x)$coef)
)
## out <- row_bind(
## lapply(df_list, function(x) glm(replicate ~ low.density, data = x,
## family = binomial())$coef)
## )
# -------------------------------------------------------------------
> dim(out)
[1] 1000 2
Output
> head(out)
X.Intercept. low.density
1 13.74881 0.2804738
2 20.01074 -0.2095672
3 30.26643 -0.2946373
4 29.19541 -0.2752761
5 37.76273 -0.4555651
6 37.72250 -0.1548349
The code required to create this image can be found here.
Here's a solution using dplyr, purrr, and tidyr. First nest the numeric data, and then sample each of the unique combinations of replicate and level in the data. Then within those, bootstrap the unique values of the densities and then unnest for final data frame.
# library(tidyverse)
library(dplyr)
library(tidyr)
library(purrr)
df %>%
nest(data = ends_with("density")) %>%
slice_sample(n = 500, replace = TRUE) %>%
mutate(data = map(data, ~summarize(.x, across(.fns = sample, size = 1)))) %>%
unnest(cols = data)
#> # A tibble: 500 × 4
#> replicate level high.density low.density
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 low 54 31
#> 2 2 mid 12 24
#> 3 1 mid 24 10
#> 4 2 up 20 2
#> 5 2 mid 12 24
#> 6 2 mid 12 24
#> 7 1 mid 82 10
#> 8 2 up NA 2
#> 9 1 low 14 36
#> 10 2 mid 12 17
#> # … with 490 more rows
Data
df <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
level = c("low", "low", "mid", "mid", "low", "low", "mid",
"mid", "up", "up", "up"), high.density = c(14, 54, 82, 24,
12, 11, 12, NA, 40, NA, 20), low.density = c(36, 31, 10,
NA, 28, 45, 17, 24, 10, 5, 2)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(replicate = structure(list(), class = c("collector_double",
"collector")), level = structure(list(), class = c("collector_character",
"collector")), high.density = structure(list(), class = c("collector_double",
"collector")), low.density = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
I would like to compare per row 2 df based on serial and day variables and to create a new column called compare to highlight the missing rows. How can this be done in R? I tried the inner_join function without success.
Sample structure df1 and df2
Desired output:
Sample data
df1<-structure(list(serial = c(1, 2, 3, 4, 5), day = c(1, 0, 1, 0,
0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 0,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
We can use tidyverse
library(dplyr)
df2 %>%
mutate(compare = TRUE) %>%
left_join(df1 %>%
mutate(compare1 = TRUE), by = c('serial', 'day')) %>%
transmute(serial, day, compare = (!is.na(compare1)))
-output
# A tibble: 7 x 3
serial day compare
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE
Or with a faster and efficient data.table
library(data.table)
setDT(df2)[, compare := FALSE][setDT(df1), compare := TRUE, on = .(serial, day)]
One way would be to create a unique key combining the two columns and use %in% to find if the key is present in another dataset.
A base R option -
df2$compare <- do.call(paste, df2) %in% do.call(paste, df1)
df2
# A tibble: 7 x 3
# serial day compare
# <dbl> <dbl> <lgl>
#1 1 1 TRUE
#2 2 0 TRUE
#3 3 1 TRUE
#4 4 0 TRUE
#5 5 0 TRUE
#6 5 1 FALSE
#7 7 1 FALSE
If there are more columns in your data apart from serial and day use the below code.
cols <- c('serial', 'day')
df2$compare <- do.call(paste, df2[cols]) %in% do.call(paste, df1[cols])
A base R option
transform(
merge(cbind(df1, compare = TRUE), df2, all = TRUE),
compare = !is.na(compare)
)
gives
serial day compare
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE
I'm trying to translate a mutate_at() to a mutate() using dplyr's new "across" function and a bit stumped.
In a nutshell, I need to compare the values in a series of columns to a "baseline" column. When the values in the columns are higher than the baseline, I need to use the baseline value. When the values in the columns are lower than or equal to the baseline, I need to keep the value. Here's an example dataset (my actual dataset is much larger):
test <- structure(list(baseline = c(5, 7, 8, 4, 9, 1, 0, 46, 47), bob = c(7,
11, 34, 9, 6, 8, 3, 49, 12), sally = c(3, 5, 2, 2, 6, 1, 3, 4,
56), rita = c(6, 4, 6, 7, 6, 0, 3, 11, 3)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(baseline = structure(list(), class = c("collector_double",
"collector")), bob = structure(list(), class = c("collector_double",
"collector")), sally = structure(list(), class = c("collector_double",
"collector")), rita = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
My current code uses mutate_at() and works fine:
trial1 <- test %>%
mutate_at(
vars('bob','sally', 'rita'),
funs(case_when(
. > baseline ~ baseline,
. <= baseline ~ .)))
But when I try to update it to reflect across() from dplyr 1.0, I keep getting an error. Here is my attempt:
trial2 <- test %>%
mutate(across(c(bob, sally, rita),
case_when(. > baseline ~ baseline,
. <= baseline ~ .)))
And here is the error:
error: Problem with mutate() input ..1.
x . > baseline ~ baseline, . <= baseline ~ . must be length 36 or one, not 9, 4.
ℹ Input ..1 is across(...)
Any ideas what I might be doing wrong? Does case_when() work with across?
We can use the ~ to specify the anonymous function/lambda function call
library(dplyr)
test %>%
mutate(across(c(bob, sally, rita),
~ case_when(. > baseline ~ baseline,
. <= baseline ~ .)))
-output
# A tibble: 9 x 4
# baseline bob sally rita
# <dbl> <dbl> <dbl> <dbl>
#1 5 5 3 5
#2 7 7 5 4
#3 8 8 2 6
#4 4 4 2 4
#5 9 6 6 6
#6 1 1 1 0
#7 0 0 0 0
#8 46 46 4 11
#9 47 12 47 3
According to ?across the arguments to fns can be either
Functions to apply to each of the selected columns. Possible values are:
NULL, to returns the columns untransformed.
A function, e.g. mean.
A purrr-style lambda, e.g. ~ mean(.x, na.rm = TRUE)
A list of functions/lambdas, e.g. list(mean = mean, n_miss = ~ sum(is.na(.x))
Also, instead of case_when, we can make use of the pmin
test %>%
mutate(across(c(bob, sally, rita), ~ pmin(baseline, .)))
-output
# A tibble: 9 x 4
# baseline bob sally rita
# <dbl> <dbl> <dbl> <dbl>
#1 5 5 3 5
#2 7 7 5 4
#3 8 8 2 6
#4 4 4 2 4
#5 9 6 6 6
#6 1 1 1 0
#7 0 0 0 0
#8 46 46 4 11
#9 47 12 47 3
This question already has answers here:
Complete dataframe with missing combinations of values
(2 answers)
Closed 2 years ago.
I have a dataframe called flu that is a count of case(n) by group per week.
flu <- structure(list(isoweek = c(1, 1, 2, 2, 3, 3, 4, 5, 5), group = c("fluA",
"fluB", "fluA", "fluB", "fluA", "fluB", "fluA", "fluA", "fluB"
), n = c(5, 6, 3, 5, 12, 14, 6, 23, 25)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(isoweek = structure(list(), class = c("collector_double",
"collector")), group = structure(list(), class = c("collector_character",
"collector")), n = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
In the data set there are some rows where zero cases are not reported in the data so there are no NA values to work with.
I have identified a fix for this to fill down missing weeks with zeros.
flu %>% complete(isoweek, nesting(group), fill = list(n = 0))
My problem is that this only works for the weeks of data reported. For example, at weeks 6, 7, 8 etc if there are no cases reported I have no data.
How can I extend this fill down process to extend the data frame with zeros for isoweeks 6 to 10 (for example) and have a corresponding fluA and fluB for each week with a zero value for each isoweek/group pair?
You can expand multiple columns in complete. Let's say if you need data till week 8, you can do :
tidyr::complete(flu, isoweek = 1:8, group, fill = list(n = 0))
# A tibble: 16 x 3
# isoweek group n
# <dbl> <chr> <dbl>
# 1 1 fluA 5
# 2 1 fluB 6
# 3 2 fluA 3
# 4 2 fluB 5
# 5 3 fluA 12
# 6 3 fluB 14
# 7 4 fluA 6
# 8 4 fluB 0
# 9 5 fluA 23
#10 5 fluB 25
#11 6 fluA 0
#12 6 fluB 0
#13 7 fluA 0
#14 7 fluB 0
#15 8 fluA 0
#16 8 fluB 0
I would like to merge two dataframe columns.
I have df1 and that has a specific column (df$col1). This column has rows 1-100, certain rows have NA values (lets say rows 10,15,20,50,69).
Dataframe 2 has rows 10,15,20,50,69.
Is it possible to merge DF2 to df$col such that only the NA values in df$col are filled by DF2..depending on the index number for each dataset
I tried this but instead got a dataframe that did not look anything like what I want
merge(brfss2$pa1min_,df,by.x=1,by.y=1,all.x=TRUE,all.y=TRUE)
Here are the two dataframes
Dataframe1:
1 NA
2 110
3 NA
4 35
5 NA
6 120
7 280
8 30
9 240
10 260
11 322
12 NA
Dataframe 2:
1 2127.6
3 1403.0
5 198.0
12 112.8
a different method - I imported your data and gave column names:
df <- structure(list(col1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
), col2 = c(NA, 110, NA, 35, NA, 120, 280, 30, 240, 260, 322,
NA)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L), spec = structure(list(cols = list(col1 = structure(list(), class = c("collector_double",
"collector")), col2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2), class = "col_spec"))
df2 <- structure(list(col1 = c(1, 3, 5, 12), col2 = c(2127.6, 1403,
198, 112.8)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L), spec = structure(list(cols = list(
col1 = structure(list(), class = c("collector_double", "collector"
)), col2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2), class = "col_spec"))
Using tidyverse you can merge and then add a new column conditionally based on the value without NA:
library(tidyverse)
df %>%
merge(df2, by = "col1", all.x = TRUE) %>%
mutate(new_col = if_else(is.na(col2.x), col2.y, col2.x)) %>%
select(new_col)
new_col
1 2127.6
2 110.0
3 1403.0
4 35.0
5 198.0
6 120.0
7 280.0
8 30.0
9 240.0
10 260.0
11 322.0
12 112.8
I wrote the package safejoin which solves this very succinctly
# devtools::install_github("moodymudskipper/safejoin")
safe_left_join(df1,df2, by = "col1", conflict = dplyr::coalesce)
# # A tibble: 12 x 2
# col1 col2
# <dbl> <dbl>
# 1 1 2128.
# 2 2 110
# 3 3 1403
# 4 4 35
# 5 5 198
# 6 6 120
# 7 7 280
# 8 8 30
# 9 9 240
# 10 10 260
# 11 11 322
# 12 12 113.