My dataframe looks like this:
Date = c(rep(as.Date(seq(15000,15012)),2))
Group = c(rep("a",13),rep("b",13))
y = c(seq(1,26,1))
x1 = c(seq(0.01,0.26,0.01))
x2 = c(seq(0.02,0.26*2,0.02))
df = data.frame(Group,Date,y,x1,x2)
head(df,3)
Group
Date
y
x1
x2
a
2011-01-26
1
0.01
0.02
a
2011-01-27
2
0.02
0.04
a
2011-01-28
3
0.03
0.06
And I would like to do multiple regression by group (y as the dependent variable and x1, x2 as the independent variables) in a rolling window i.e. 3.
I have tried to achieve this using packages tidyverse and zoo with following codes but failed.
## define multi-var-linear regression function and get the residual
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
## apply it by group with rolling window
x <- df %>% group_by(Group) %>%
rollapplyr(. , width = 3, FUN = rsd)
The output of this code is not what I acutually want.
Does anyone know how to do multiple regression by group in a rolling window?
Thanks in advance, Giselle
Thank Grothendieck and Marcus for your codes!
It really helped me a lot:)
I now appened them here:
# Grothendieck method
rsd <- function(df){
lm(formula = y~x1+x2, data = as.data.frame(df), na.action = na.omit) %>%
resid() %>%
return()
}
width <- 5
df_m2 <-
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup %>%
select(c("Group","Date","5")) %>%
dplyr::rename(residual_m2 = "5")
# Marcus method
output <- data.frame()
for (i in unique(df$Group)) {
a = df%>% subset(Group==i)
a[,"residual"] = NA
max = nrow(a)
if(max<5){
next
}
for (j in seq(5,max,by=1)) {
b = a %>% slice((j-4):j)
lm_ = lm(y~x1+x2, data = b)
a[j,]$residual = residuals(lm_)[5]
}
output <-
output %>%
rbind(a)
}
Use group_modify and use rollapplyr with the by.column = FALSE argument so that rsd is applied to all columns at once rather than one at a time.
Note that if you use width 3 with two predictors and an intercept the residuals will necessarily be all zero so we changed the width to 5.
library(dplyr, exclude = c("lag", "filter"))
library(zoo)
width <- 5
df %>%
group_by(Group) %>%
group_modify(~ {
cbind(., rollapplyr(.[c("y", "x1", "x2")], width, rsd, fill = NA,
by.column = FALSE))
}) %>%
ungroup
A good old-fashioned for-loop here could be:
for (i in unique(df$Group)){
for (j in (seq(15000,15012, 3))){
lm_ <- lm(formula = df[df$Group== i & df$Date %in% c(j, j+1, j+2), 3] ~ df[df$Group== i & df$Date %in% c(j, j+1, j+2), 4] + df[df$Group== i & df$Date %in% c(j, j+1, j+2), 5], na.action = na.omit)
print(paste('Group', i, 'Dates from', j, 'to', j+3, residuals(lm_)))
}
}
Related
library(tidyverse)
library(broom)
library(dplyr)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum)) %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)",
p.value < 0.05)
sim
j=exp(sim %>% select("estimate"))
OR=as.numeric(unlist(j))
mean(OR)
hist(OR,main=NULL,freq=T,breaks=10)
abline(v=mean(OR),lwd=4,col=1)
The question here: now I extract all the value which p<0.05, now I using the code "hist(OR,main=NULL,freq=T,breaks=10)" to make a histogram for the odds ratio. The new thing I want to do is make another histogram(like without any condition for p-value) overlapping the original one, then I could compare the histogram with the different p-value in one plot, which code can work with that?
This solution repeats the question's code but
stop the pipe right after unnest(cols = c(sum));
create a simOR like you have continued the pipe and a simAll but this time not filtering the p-values.
First the question's code. Note that if package tidyverse is loaded there is no need to load package dplyr.
I also set the RNG seed to make the results reproducible.
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
set.seed(2020)
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum))
Now create the two data sets to be plotted.
simOR <- sim %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)", p.value < 0.05)
j <- exp(simOR %>% select("estimate"))
OR <- as.numeric(unlist(j))
mean(OR)
And the data set with all the rows, dropping only the intercepts.
simAll <- sim %>%
filter(term !="(Intercept)")
j <- exp(simAll %>% select("estimate"))
All <- as.numeric(unlist(j))
mean(All)
Now plot the histograms (not overlapped).
op <- par(mfrow = c(2, 1))
hist(OR, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(OR), lwd = 4, col = 1)
hist(All, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(All), lwd = 4, col = 1)
par(op)
I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things or the fact that there could be 2 consequtive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)
set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
attrib = c(sample(c("one", "two", "three", "four"), 10, replace = T),
sample(c("one", "two", "three"), 10, replace = T),
sample(c("one", "three", "four"), 10, replace = T)),
v1 = sample(c(1:5, NA), 30, replace = T),
v2 = sample(c(1:5, NA), 30, replace = T),
v3 = sample(c(1:5, NA), 30, replace = T),
n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # Placeholder for final results
for(i in seq(length(unique(x$group)))) { # loop through groups
mygroup <- unique(x$group)[i]
mysubtable <- x %>% filter(group == mygroup)
# IMPUTE NAs in v columns
# Replace every NA with a mean of values above and below it; and if it's the first or
# the last value, with the mean of 2 values below or above it.
for (v in v_vars){ # loop through v columns
which_nas <- which(is.na(mysubtable[[v]])) # create index of NAs for column v
if (length(which_nas) == 0) next else {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
mysubtable[[v]][na + 2]), na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
mysubtable[[v]][na - 1]), na.rm = TRUE)
} else {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
mysubtable[[v]][na + 1]), na.rm = TRUE)
}
} # end of loop through NA indexes
} # end of else
} # end of loop through v vars
# Aggregate v columns (mean) for each value of column 'attrib'
result1 <- mysubtable %>% group_by(attrib) %>%
summarize_at(v_vars, mean)
# Aggregate n columns (sum) for each value of column 'attrib'
result2 <- mysubtable %>% group_by(attrib) %>%
summarize_at(n_vars, sum)
# final result should contain the name of the group
results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
which_nas <- which(is.na(x))
for (na in which_nas) {
if (na == 1) {
x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
} else if (na == length(x)) {
x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
} else {
x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
}
}
x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be on a bunch of columns, is to put it in long format. Here it feels a little like the long way round, but perhaps this would be useful to see.
x %>% mutate(row=1:n()) %>% gather("variable", "value", c(v_vars, n_vars)) %>%
separate(variable, c("var", "x"), sep=1) %>% spread(var, value) %>%
arrange(group, x, row) %>% group_by(group, x) %>%
mutate(v=my_impute(v)) %>% group_by(group, attrib, x) %>%
summarize(v=mean(v), n=sum(n)) %>%
gather("var", "value", v, n) %>% mutate(X=paste0(var, x)) %>%
select(-x, -var) %>% spread(X, value)
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
}
x %>% group_by(group) %>% nest() %>%
mutate(result=map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
} else {
z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
}
} # end of loop through NA indexes
}
return(z)
}
I like data.table so here's a solution that uses it. Now since you use different functions for the n and v variable groups, most purrr or any other solutions will also be a little funny.
library(data.table)
dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]
# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
, by = .(group,attrib)
, .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because the results can depend on itself. Here is my attempt which is not an identical output to yours:
# not identical
fx_na_rm2 <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = T)
}
return(z)
}
I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.
Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.
i have a question regarding indexing a dataframe in R. This is the Code:
Gewicht <- data %>%
group_by(data[[376]]) %>%
summarise(weights = mean(data[[10190]], na.rm = TRUE))
Gewicht2 <- data %>%
group_by(data[[376]]) %>%
summarise(weights = mean(Weights, na.rm = TRUE))
a <- seq(1:10)
b <- rep(c("male", "female"),5)
c <- seq(1:10)
data <- as.data.frame(cbind(a,b,c))
data$c <- as.numeric(data$c)
newdata <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(c, na.rm = TRUE))
newdata2 <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(data[[3]], na.rm = TRUE))
print(newdata)
print(newdata2)
I get different results for both dataframes. The desired result in the "newdata". Can you tell me WHY i get different values for these two calculations?
I need brackets for a more complex custom function, but it seems it writes the mean for the whole dataframe, where i would hope to get the mean for each group.
How to use [] or [[]] correctly here?
a <- c(1,2,3,4,5,6,7,8,9,10)
b <- rep(c("male", "female"),5)
c <- c(1,2,3,4,5,6,7,8,9,10)
data <- data.frame(cbind(a,b,c))
data$c <- as.numeric(as.character(data$c))
c
data$c
print(newdata)
print(newdata2)
newdata <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(c, na.rm = TRUE))
newdata2 <- data %>%
group_by(data[[2]]) %>%
summarise(Mean = mean(data[[3]], na.rm = TRUE))
newdata
newdata2
updated code, still different results :(
Gewicht <- aggregate(data[[varGewicht]], by=list(data[[varx]]), FUN=mean, na.rm = TRUE)
Aggregate function works :-)
I wanted to use dplyr instead of apply,1 in order to filter a dataset rowwise according to a logical expression, ie for this example I´d like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance in dplyr. Any ideas if I can speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) #my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <-
map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
browser()
return(!any(vec == 99))
}
help_dp_t <- function(df) {
clo1 <- proc.time()
a <- as_data_frame(t(df)) %>% summarise_all(help_an)
df2 <- filter(df, t(a)[, 1])
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
clo1 <- proc.time()
a_base <- df[apply(df, 1, function(x)
! any(x == 99)), ]
b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <-
map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")
ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
geom_line()
I tried the following with rowwise, but the rowwise pipe does not send a vector, but the entire df to the help_an function.
help_dp_r <- function(df) {
clo1 <- proc.time()
df2 <-
df %>% rowwise() %>% mutate(cond = help_an(.)) ### . is not passed on as a vector, but the entire df??
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))