Make column with "sample" for each row with purrr - r

I'm trying to make column with sample value for each row of data
But I'm new with purrr and can't make this.
My code
df<-data.frame(x=rep(1:3,each=4),y=99)
df%>%
group_by(x)%>%
mutate_(val=~purrr::map_dbl(function(x) sample(50,1)))
This didn't work.
But function with purrr only working:
1:5%>%purrr::map_dbl(function(x) sample(50,1))
[1] 39 30 7 18 45
Thanks for any help!

You don't need purrr:
df <- data.frame(x = rep(1:3, each = 4), y = 99)
df %>%
group_by(x) %>%
mutate(val = sample(50, n()))
Output
# A tibble: 12 x 3
# Groups: x [3]
x y val
<int> <dbl> <int>
1 1 99.0 10
2 1 99.0 25
3 1 99.0 2
4 1 99.0 24
5 2 99.0 48
6 2 99.0 19
7 2 99.0 34
8 2 99.0 33
9 3 99.0 24
10 3 99.0 14
11 3 99.0 37
12 3 99.0 12

If you need to use purrr, I guess you could do:
dplyr::mutate(df, val = purrr::map(x, ~ sample(50, 1)))
x y val
1 1 99 35
2 1 99 4
3 1 99 43
4 1 99 28
5 2 99 49
6 2 99 31
7 2 99 31
8 2 99 31
9 3 99 19
10 3 99 4
11 3 99 43
12 3 99 20
Or with the pipe:
library(dplyr)
library(purrr)
df %>%
mutate(val = map(x, ~ sample(50, 1)))
Data:
df <- data.frame(x = rep(1:3, each = 4), y = 99)

Related

Looping a loop to do everything at once

I am trying to simulate the following "game:
There is a population of 100 units
You randomly sample 10 of these units, record the id's of the units you saw, and then put them back into the population
You then take a second sample, record the id's of the units you saw in this second sample along with the first sample, and then put the second sample back into the population
Repeat this many times
I wrote the following code in R that performs the above procedure:
library(dplyr)
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,1,10)
var_3 = rnorm(100,5,10)
response = rnorm(100,1,1)
my_data = data.frame(var_1, var_2, var_3, response)
my_data$id = 1:100
results <- list()
results2<- list()
for (i in 1:100)
{
iteration_i = i
sample_i = my_data[sample(nrow(my_data), 10), ]
results_tmp = data.frame(iteration_i, sample_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
group_by(id) %>%
filter(iteration_i == min(iteration_i)) %>%
distinct)
summary_file = data.frame(test_1 %>% group_by(iteration_i) %>% summarise(Count = n()))
cumulative = cumsum(summary_file$Count)
summary_file$Cumulative = cumulative
summary_file$unobserved = 100 - cumulative
The result looks something like this:
> summary_file
iteration_i Count Cumulative unobserved
1 1 10 10 90
2 2 8 18 82
3 3 9 27 73
4 4 8 35 65
5 5 6 41 59
6 6 5 46 54
7 7 7 53 47
8 8 7 60 40
9 9 4 64 36
10 10 3 67 33
11 11 4 71 29
12 12 4 75 25
13 13 1 76 24
14 14 4 80 20
15 15 1 81 19
16 16 2 83 17
17 17 2 85 15
18 18 1 86 14
19 20 1 87 13
20 22 1 88 12
21 23 2 90 10
22 24 1 91 9
23 25 1 92 8
24 27 2 94 6
25 28 1 95 5
26 30 1 96 4
27 35 1 97 3
28 37 1 98 2
29 44 1 99 1
30 46 1 100 0
I would now like to repeat this "game" many times.
I would like to keep the "summary_file" for each "game" (e.g. summary_file_1, summary_file_2, summary_file_3, etc.)
I would then like to create a "total" summary file that shows the number of iterations that were required in each game to observe all units.
This total_summary_file would look something like this:
game_id iterations_required
1 game_1 47
2 game_2 45
3 game_3 44
4 game_4 42
5 game_5 42
Currently, I am just copy/pasting my earlier code several times and storing the results, then I append everything at the end and calculate the summary statistics - but I am trying to find a way to "loop the loop" and do everything at once. I do not know if it is possible to introduce a command like "results_df_i <- do.call(rbind.data.frame, results_i)" into the loop and efficiently create everything at the same time instead of manually copy/pasting the earlier loop.
You're making this a lot less efficient than it could be. To get, say, 100 repeated samples of 10 from the set 1:100 (with replacement), we can do replicate(100, sample(100, 10, TRUE)).
We can then coerce this into a vector and count the number of unique values every 10 entries along the vector until we get to 100. This gives us the number of iterations required to exhaust the samples.
If we put this inside an sapply, we don't even need an explicit loop, which means we can create the results data frame in a single call:
set.seed(1)
n_games <- 10
results <- data.frame(game_id = paste("game", seq(n_games), sep = "_"),
iterations_required = sapply(seq(n_games), function(x) {
samp <- c(replicate(100, sample(100, 10, TRUE)))
sum(sapply(1:100 * 10, function(n) length(unique(samp[1:n]))) < 100)
}))
results
#> game_id iterations_required
#> 1 game_1 59
#> 2 game_2 44
#> 3 game_3 54
#> 4 game_4 59
#> 5 game_5 57
#> 6 game_6 58
#> 7 game_7 96
#> 8 game_8 60
#> 9 game_9 71
#> 10 game_10 33
Created on 2022-06-11 by the reprex package (v2.0.1)
There are lots of ways to get your desired outcome; wrapping your loop in a function and running the function multiple times is another potential solution:
library(dplyr)
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,1,10)
var_3 = rnorm(100,5,10)
response = rnorm(100,1,1)
my_data = data.frame(var_1, var_2, var_3, response)
my_data$id = 1:100
results <- list()
results2<- list()
sample_func <- function(output_file_name) {
for (i in 1:100)
{
iteration_i = i
sample_i = my_data[sample(nrow(my_data), 10), ]
results_tmp = data.frame(iteration_i, sample_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
group_by(id) %>%
filter(iteration_i == min(iteration_i)) %>%
distinct)
summary_file = data.frame(test_1 %>% group_by(iteration_i) %>% summarise(Count = n()))
cumulative = cumsum(summary_file$Count)
summary_file$Cumulative = cumulative
summary_file$unobserved = 100 - cumulative
output_file_name <- summary_file
return(output_file_name)
}
list_of_names <- paste0("game_", 1:10)
output <- lapply(list_of_names, sample_func)
names(output) <- list_of_names
head(output, n = 2)
#> $game_1
#> iteration_i Count Cumulative unobserved
#> 1 1 10 10 90
#> 2 2 9 19 81
#> 3 3 8 27 73
#> 4 4 8 35 65
#> 5 5 6 41 59
#> 6 6 6 47 53
#> 7 7 6 53 47
#> 8 8 4 57 43
#> 9 9 4 61 39
#> 10 10 3 64 36
#> 11 11 2 66 34
#> 12 12 1 67 33
#> 13 13 4 71 29
#> 14 14 1 72 28
#> 15 15 2 74 26
#> 16 16 2 76 24
#> 17 17 3 79 21
#> 18 18 4 83 17
#> 19 19 2 85 15
#> 20 20 2 87 13
#> 21 21 1 88 12
#> 22 24 1 89 11
#> 23 25 2 91 9
#> 24 26 1 92 8
#> 25 27 1 93 7
#> 26 30 1 94 6
#> 27 31 1 95 5
#> 28 33 1 96 4
#> 29 34 1 97 3
#> 30 36 1 98 2
#> 31 41 1 99 1
#> 32 66 1 100 0
#>
#> $game_2
#> iteration_i Count Cumulative unobserved
#> 1 1 10 10 90
#> 2 2 10 20 80
#> 3 3 7 27 73
#> 4 4 7 34 66
#> 5 5 8 42 58
#> 6 6 5 47 53
#> 7 7 7 54 46
#> 8 8 5 59 41
#> 9 9 1 60 40
#> 10 10 7 67 33
#> 11 11 3 70 30
#> 12 12 3 73 27
#> 13 13 1 74 26
#> 14 14 3 77 23
#> 15 15 4 81 19
#> 16 16 3 84 16
#> 17 17 2 86 14
#> 18 18 1 87 13
#> 19 19 2 89 11
#> 20 20 1 90 10
#> 21 21 2 92 8
#> 22 22 1 93 7
#> 23 25 2 95 5
#> 24 27 1 96 4
#> 25 29 2 98 2
#> 26 30 1 99 1
#> 27 41 1 100 0
The output is a list, so you can use purrr to apply functions to each element (e.g. https://purrr.tidyverse.org/reference/lmap.html) or use the bind_rows() function to create a single dataframe for further use, e.g.
df2 <- bind_rows(output, .id = "game") %>%
group_by("Game" = factor(game, levels = list_of_names)) %>%
summarise(rows_in_output = n(),
number_of_iterations = max(iteration_i))
df2
#> # A tibble: 10 × 3
#> Game rows_in_output number_of_iterations
#> <fct> <int> <int>
#> 1 game_1 32 66
#> 2 game_2 27 41
#> 3 game_3 27 48
#> 4 game_4 32 50
#> 5 game_5 27 35
#> 6 game_6 27 71
#> 7 game_7 28 68
#> 8 game_8 27 48
#> 9 game_9 29 43
#> 10 game_10 29 66
Created on 2022-06-17 by the reprex package (v2.0.1)
Or you can use list2env() to get have each individual dataframe in your environment, e.g.
list2env(output, envir = .GlobalEnv)
ls()
#> [1] "df2" "game_1" "game_10" "game_2" "game_3" "game_4"
#> [7] "game_5" "game_6" "game_7" "game_8" "game_9" "list_of_names"
#> [13] "my_data" "output" "response" "results" "results2" "sample_func"
#> [19] "var_1" "var_2" "var_3"
This seems easily solvable using recursion:
fun <- function(x, i=1, size = 10){
a <- setdiff(x,sample(100, 10, TRUE)) # Remove the seen from x
if(length(a)) Recall(a, i+1) else i # if we have unobserved, call fun again
}
Now we can have as many games as we want:
data.frame(game = paste0('game',seq(10)), results = replicate(10, fun(1:100)))
game results
1 game1 62
2 game2 40
3 game3 51
4 game4 50
5 game5 34
6 game6 83
7 game7 38
8 game8 40
9 game9 53
10 game10 41
You could also do
hist(replicate(1000, fun(1:100)), breaks = 30)
Edit:
Note that this can be editted to take in any size and vector. eg:
fun <- function(x, size = 10, y=x, i=1){
a <- setdiff(x,sample(y, size, TRUE))
cat('i', i, '\t a: ',a, '\n')
if(length(a)>0) Recall(a, size, y, i+1) else i
}
set.seed(117);fun(1:10, 1)
i 1 a: 1 2 4 5 6 7 8 9 10 # 3 removed
i 2 a: 1 2 4 5 7 8 9 10 # 6 removed
i 3 a: 1 2 4 5 7 8 9 # 10 removed
i 4 a: 1 2 4 5 8 9 # 7 removed
i 5 a: 1 2 5 8 9 # 4 removed
i 6 a: 1 2 5 8 9 # Nothing removed
i 7 a: 1 5 8 9 # 2 removed
i 8 a: 1 5 8 # 9 removed
i 9 a: 1 5 # Nothing removed
i 10 a: 1 5 # Nothing removed
i 11 a: 5 # 1 removed
i 12 a: # 5 removed
Using Markov chains, we can produce the cumulative distribution function for the number of iterations required for a game (up to machine precision). The resulting CDF can be sampled directly using findInterval.
We can simplify things slightly by starting with the second iteration, since the first iteration will always result in 90 unseen units.
First, set up a matrix for all possible transitions:
m <- matrix(c(rep(90:1, each = 11), sequence(rep(11,90), 90:1, -1)), ncol = 2, dimnames = list(NULL, c("from", "to")))
m <- m[m[,2] >= 0L,]
Then create a transition matrix with row 1 representing the state where all units have been seen and row 91 representing the state where 10 units have been seen:
mTrans <- matrix(0, 91, 91)
The number of previously unseen units selected follows the hypergeometric distribution.
mTrans[m + 1L] <- dhyper(m[,1] - m[,2], m[,1], 100L - m[,1], 10L)
Row 1 represents an absorbing state since all units have been seen.
mTrans[1, 1] <- 1
mTrans contains the probabilities of each state after the second iteration.
Initialize a while loop and calculate the CDF.
mm <- mTrans %*% mTrans
maxIter <- 1000L
p <- numeric(maxIter)
iter <- 3L
while (p[iter] < 1) {
if ((iter <- iter + 1L) > maxIter) {
p <- c(p, numeric(maxIter))
maxIter <- maxIter*2L
}
mm <- mm %*% mTrans
p[iter] <- mm[91, 1]
}
p <- p[1:iter]
iter
#> [1] 345
Machine precision limits the CDF to less than 345 iterations. Plot the CDF:
plot(p, xlab = "iterations", ylab = "cumulative probability")
Using findInterval we can quickly generate a large number of random samples of the iterations required.
ngames <- 1e6L # one million games
results <- data.frame(game_id = 1:ngames, iterations_required = findInterval(runif(ngames), p))
head(results)
#> game_id iterations_required
#> 1 1 73
#> 2 2 69
#> 3 3 40
#> 4 4 41
#> 5 5 44
#> 6 6 43
Get a histogram of the sample number of iterations required.
hist(results$iterations_required)
OP here! I think I was able to find an answer to my own question:
library(dplyr)
var_1 <- rnorm(100, 10, 10)
var_2 <- rnorm(100, 1, 10)
var_3 <- rnorm(100, 5, 10)
response <- rnorm(100, 1, 1)
my_data <- data.frame(var_1, var_2, var_3, response)
my_data$id <- 1:100
simulate <- function() {
results <- list()
results2 <- list()
for (i in 1:100) {
iteration_i <- i
sample_i <- my_data[sample(nrow(my_data), 10), ]
results_tmp <- data.frame(iteration_i, sample_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
group_by(id) %>%
filter(iteration_i == min(iteration_i)) %>%
distinct)
summary_file <- data.frame(test_1 %>%
group_by(iteration_i) %>%
summarise(Count=n()))
cumulative <- cumsum(summary_file$Count)
summary_file$Cumulative <- cumulative
summary_file$unobserved <- 100 - cumulative
return(summary_file)
}
# now, loop 10 times!
results <- list()
for (i in 1:10) {
game_i <- i
s_i <- simulate()
results_tmp <- data.frame(game_i, s_i)
results[[i]] <- results_tmp
}
final_file <- do.call(rbind.data.frame, results)
Thanks for your help everyone!

how to apply a function to each group

I have this function*,
x<- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
f<- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2),
gap3 = f(., 3),
gap4 = f(., 4),
gap5 = f(., 5)) %>%
filter(Ones == 1)
It finds when the variable Ones is qual to 1 and sums,
if gap1 = f(., 1), the values n-1, n and n+1 in Thats, with n being in the same row as 1,
if gap2 = f(.,2) n-2 + n-1 + n + n+1 + n+2...
And so on.
This is the output.
Ones Thats gap1 gap2 gap3 gap4 gap5
1 1 0 5 8 14 22 27
2 1 4 17 29 40 48 51
3 1 1 4 16 27 38 50
Now I want to use this function for each group of my sample.
This is a Facsimile of my sample. The real sample has more conditions and more observation, but this is just to give you an idea.
dat<- data.frame (
'sub' = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2),
'trial' = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
'Thumb'= c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0),
'Index'= c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5,4,5,3,23,2,1,4,6,3,2,3,2,4,6,3,2,4,3,2,1))
I need to apply the function to all groups of my sample.
For example, in this specific case I would 4 conditions:
sub1/trial1, sub2/trial1, sub1/trial2, sub2/trial2
I tried to write the function in this way:
f<- function(df, n){
df %>% group_by(trial, sub) %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
But it doesn't work.
Do you have any advice to give me?
Please consider that the sample above is only an example... I need a general function working also with more conditions and observations.
Thank you,
Best regards
*credits: Kindly proposed by Donald Seinen on this site.
We may need group_modify if we are using the first function f
library(dplyr)
dat %>%
group_by(trial, sub) %>%
group_modify(~ .x %>%
mutate(gap1 = f(.x, 1), gap2 = f(.x, 2),
gap3 = f(.x, 3), gap4 = f(.x, 4), gap5 = f(.x, 5))) %>%
ungroup
-output
# A tibble: 40 × 9
trial sub Thumb Index gap1 gap2 gap3 gap4 gap5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 5 8 14 22 27
2 1 1 0 5 8 14 22 27 32
3 1 1 0 3 13 22 27 32 38
4 1 1 0 6 16 26 32 38 46
5 1 1 0 8 18 26 37 46 49
6 1 1 1 4 17 29 40 48 49
7 1 1 0 5 15 31 40 43 48
8 1 1 0 6 18 26 34 40 43
9 1 1 0 8 16 21 26 34 40
10 1 1 0 3 10 16 21 26 34
# … with 30 more rows
which is the same result we get on the subset of data for the first combination
> dat %>%
filter(sub == 1, trial == 1) %>%
select(-sub, -trial) %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2),
gap3 = f(., 3),
gap4 = f(., 4),
gap5 = f(., 5))
Thumb Index gap1 gap2 gap3 gap4 gap5
1 1 0 5 8 14 22 27
2 0 5 8 14 22 27 32
3 0 3 13 22 27 32 38
4 0 6 16 26 32 38 46
5 0 8 18 26 37 46 49
6 1 4 17 29 40 48 49
7 0 5 15 31 40 43 48
8 0 6 18 26 34 40 43
9 0 8 16 21 26 34 40
10 0 3 10 16 21 26 34

Create multiple new columns in tibble in R based on value of previous row giving prefix to all

I have a tibble as so:
df <- tibble(a = seq(1:10),
b = seq(21,30),
c = seq(31,40))
I want to create a new tibble, where I want to lag some. I want to create new columns called prev+lagged_col_name, eg prev_a.
In my actual data, there are a lot of cols so I don't want to manually write it out. Additonally I only want to do it for some cols. In this eg, I have done it manually but wanted to know if there is a way to use a function to do it.
df_new <- df %>%
mutate(prev_a = lag(a),
prev_b = lag(b),
prev_d = lag(d))
Thanks for your help!
With the current dplyr version you can create new variable names with mutate_at, using a named list will take the name of the list as suffix. If you want it as a prefix as in your example you can use rename_at to correct the variable naming. With your real data, you need to adjust the vars() selection. For your example data matches("[a-c]") did work.
library(dplyr)
df <- tibble(a = seq(1:10),
b = seq(21,30),
c = seq(31,40))
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x)))
#> # A tibble: 10 x 6
#> a b c a_prev b_prev c_prev
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
df %>%
mutate_at(vars(matches("[a-c]")), list(prev = ~ lag(.x))) %>%
rename_at(vars(contains( "_prev") ), list( ~paste("prev", gsub("_prev", "", .), sep = "_")))
#> # A tibble: 10 x 6
#> a b c prev_a prev_b prev_c
#> <int> <int> <int> <int> <int> <int>
#> 1 1 21 31 NA NA NA
#> 2 2 22 32 1 21 31
#> 3 3 23 33 2 22 32
#> 4 4 24 34 3 23 33
#> 5 5 25 35 4 24 34
#> 6 6 26 36 5 25 35
#> 7 7 27 37 6 26 36
#> 8 8 28 38 7 27 37
#> 9 9 29 39 8 28 38
#> 10 10 30 40 9 29 39
Created on 2020-04-29 by the reprex package (v0.3.0)
You could do this this way
df_new <- bind_cols(
df,
df %>% mutate_at(.vars = vars("a","b","c"), function(x) lag(x))
)
Names are a bit nasty but you can rename them check here. Or see #Bas comment to get the names with a suffix.
# A tibble: 10 x 6
a b c a1 b1 c1
<int> <int> <int> <int> <int> <int>
1 1 21 31 NA NA NA
2 2 22 32 1 21 31
3 3 23 33 2 22 32
4 4 24 34 3 23 33
5 5 25 35 4 24 34
6 6 26 36 5 25 35
7 7 27 37 6 26 36
8 8 28 38 7 27 37
9 9 29 39 8 28 38
10 10 30 40 9 29 39
If you have dplyr 1.0 you can use the new accross() function.
See some expamples from the docs, instead of mean you want lag
df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
# ->
df %>% mutate(across(is.numeric, mean, na.rm = TRUE))
df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
# ->
df %>% mutate(across(c(x, starts_with("y")), mean, na.rm = TRUE))
df %>% mutate_all(mean, na.rm = TRUE)
# ->
df %>% mutate(across(everything(), mean, na.rm = TRUE))

Group_by / summarize by two variables within a function

I would like to write a function that summarize the provided data by some specified criteria, in this case by age
The example data is a table of users' age and their stats.
df <- data.frame('Age'=rep(18:25,2), 'X1'=10:17, 'X2'=28:35,'X4'=22:29)
Next I define the output columns that are relevant for the analysis
output_columns <- c('Age', 'X1', 'X2', 'X3')
This function computes the basic the sum of X1. X2 and X3 grouped by age.
aggr <- function(data, criteria, output_columns){
k <- data %>% .[, colnames(.) %in% output_columns] %>%
group_by_(.dots = criteria) %>%
#summarise_each(funs(count), age) %>%
summarize_if(is.numeric, sum)
return (k)
}
When I call it like this
> e <- aggr(df, "Age", output_columns)
> e
# A tibble: 8 x 3
Age X1 X2
<int> <int> <int>
1 18 20 56
2 19 22 58
3 20 24 60
4 21 26 62
5 22 28 64
6 23 30 66
7 24 32 68
8 25 34 70
I want to have another column called count which shows the number of observations in each age group. Desired output is
> desired
Age X1 X2 count
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
I have tried different ways to do that, e.g. tally(), summarize_each
etc. They all deliver wrong results.
I believe their should be an easy and simple way to do that.
Any help is appreciated.
Since you're already summing all variables, you can just add a column of all 1s before the summary function
aggr <- function(data, criteria, output_columns){
data %>%
.[, colnames(.) %in% output_columns] %>%
group_by_(.dots = criteria) %>%
mutate(n = 1L) %>%
summarize_if(is.numeric, sum)
}
# A tibble: 8 x 4
Age X1 X2 n
<int> <int> <int> <int>
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2
We could create the 'count' column before summarise_if
aggr<- function(data, criteria, output_columns){
data %>%
select(intersect(names(.), output_columns))%>%
group_by_at(criteria)%>%
group_by(count = n(), add= TRUE) %>%
summarize_if(is.numeric,sum) %>%
select(setdiff(names(.), 'count'), count)
}
aggr(df,"Age",output_columns)
# A tibble: 8 x 4
# Groups: Age [8]
# Age X1 X2 count
# <int> <int> <int> <int>
#1 18 20 56 2
#2 19 22 58 2
#3 20 24 60 2
#4 21 26 62 2
#5 22 28 64 2
#6 23 30 66 2
#7 24 32 68 2
#8 25 34 70 2
In base R you could do
aggr <- function(data, criteria, output_columns){
ds <- data[, colnames(data) %in% output_columns]
d <- aggregate(ds, by=list(criteria), function(x) c(sum(x), length(x)))
"names<-"(do.call(data.frame, d)[, -c(2:3, 5)], c(names(ds), "n"))
}
> with(df, aggr(df, Age, output_columns))
Age X1 X2 n
1 18 20 56 2
2 19 22 58 2
3 20 24 60 2
4 21 26 62 2
5 22 28 64 2
6 23 30 66 2
7 24 32 68 2
8 25 34 70 2

summarise dataset conditioning on variable using dplyr

I want to summarise my dataset grouping the variable age into 5 years age groups, so instead of single age 0 1 2 3 4 5 6... I would have 0 5 10 15 etc. with 80 being my open-ended category. I could do this by categorizing everything by hand creating a new variable, but I am sure there must be a quicker way!
a <- cbind(age=c(rep(seq(0, 90, by=1), 2)), value=rnorm(182))
Any ideas?
like this ?
library(dplyr)
a %>% data.frame %>% group_by(age_group = (sapply(age,min,80) %/% 5)*5) %>%
summarize(avg_val = mean(value))
# A tibble: 17 x 2
age_group avg_val
<dbl> <dbl>
1 0 -0.151470805
2 5 0.553619149
3 10 0.198915973
4 15 -0.436646287
5 20 -0.024193193
6 25 0.102671120
7 30 -0.350059839
8 35 0.010762264
9 40 0.339268917
10 45 -0.056448481
11 50 0.002982158
12 55 0.348232262
13 60 -0.364050091
14 65 0.177551510
15 70 -0.178885909
16 75 0.664215782
17 80 -0.376929230
Example data
set.seed(1)
df <- data.frame(age=runif(1000)*100,
value=runif(1000))
Simply add the max value of your group to seq(0,80,5) for irregular breaks with c(..., max(age))
library(dplyr)
df %>%
mutate(age = cut(age, breaks=c(seq(0,80,5), max(age)))) %>%
group_by(age) %>%
summarise(value=mean(value))
Output
age value
<fctr> <dbl>
1 (0,5] 0.4901119
2 (5,10] 0.5131055
3 (10,15] 0.5022297
4 (15,20] 0.4712481
5 (20,25] 0.5610872
6 (25,30] 0.4207203
7 (30,35] 0.5218318
8 (35,40] 0.4377102
9 (40,45] 0.5007616
10 (45,50] 0.4941768
11 (50,55] 0.5350272
12 (55,60] 0.5226967
13 (60,65] 0.5031688
14 (65,70] 0.4652641
15 (70,75] 0.5667020
16 (75,80] 0.4664898
17 (80,100] 0.4604779

Resources