Question:
Below works, but is there a better "R way" of achieving a similar result? I am essentially trying to create / distribute groups into individual line items according to a user-defined function (currently just using a loop).
Example:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L)
)
print(df1)
#> group volume
#> 1 A 200
#> 2 B 45
#> 3 C 104
I want the volume to be broken across multiple rows per group, so that the final result is a dataframe where the new volume (vol2 below) adds up to the original volume above. In this example I'm applying integer math with a divisor of 52, so my final result should be:
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
This works
The code below DOES get me to the desired result shown above:
div <- 52L
df1$intgr <- df1$volume %/% div
df1$remainder <- df1$volume %% div
print(df1)
#> group volume intgr remainder
#> 1 A 200 3 44
#> 2 B 45 0 45
#> 3 C 104 2 0
df2 <- data.frame()
for (r in 1:nrow(df1)) {
  if (df1[r, "intgr"] > 0) {
    for (k in 1:as.integer(df1[r, "intgr"])) {
      df1[r, "vol2"] <- div
      df2 <- rbind(df2, df1[r, ])
    }
  }
  if (df1[r, "remainder"] > 0) {
    df1[r, "vol2"] <- as.integer(df1[r, "remainder"])
    df2 <- rbind(df2, df1[r, ])
  }
}
print(df2)
#> group volume intgr remainder vol2
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 21 B 45 0 45 45
#> 31 C 104 2 0 52
#> 32 C 104 2 0 52
df3 <- subset(df2, select = c("group", "vol2"))
print(df3)
#> group vol2
#> 1 A 52
#> 2 A 52
#> 3 A 52
#> 4 A 44
#> 21 B 45
#> 31 C 52
#> 32 C 52
Being still relatively new to R, I'm just curious whether someone knows a better way / function / method that gets to the same place; it seems like there might be. I could potentially have a more complex rule for breaking up the rows, and I was thinking there might be a method that applies a UDF to the dataframe to do something like this. I was searching for "expand group/groups" but was mostly finding expand.grid, which isn't what I'm doing here.
Thank you for any suggestions!
A quick function to help split each number by the modulus,
fun <- function(num, mod) c(rep(mod, ceiling(num / mod) - 1), (num - 1) %% mod + 1)
fun(200, 52)
# [1] 52 52 52 44
fun(45, 52)
# [1] 45
fun(104, 52)
# [1] 52 52
And we can apply this a number of ways:
dplyr
library(dplyr)
df1 %>%
  group_by(group) %>%
  summarize(vol2 = fun(volume, 52), .groups = "drop")
# # A tibble: 7 x 2
# group vol2
# <chr> <dbl>
# 1 A 52
# 2 A 52
# 3 A 52
# 4 A 44
# 5 B 45
# 6 C 52
# 7 C 52
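Note that on dplyr 1.1 or later, summarize() warns when a group returns more than one row; reframe() is the intended replacement there, so an equivalent sketch (assuming dplyr >= 1.1) is:
df1 %>%
  reframe(vol2 = fun(volume, 52), .by = group)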
base R
do.call(rbind, by(df1, seq(nrow(df1)),
                  FUN = function(z) data.frame(group = z$group, vol2 = fun(z$volume, 52))))
data.table
library(data.table)
setDT(df1)
df1[, .(vol2 = fun(volume, 52)), by = group]
A tidyverse approach using purrr::pmap and tidyr::unnest_longer may look like so:
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(purrr)
div <- 52
df1 |>
  mutate(intgr = volume %/% div, remainder = volume %% div, intgr1 = +(remainder > 0)) |>
  mutate(vol2 = purrr::pmap(list(intgr, intgr1, remainder), ~ c(rep(div, ..1), rep(..3, ..2)))) |>
  tidyr::unnest_longer(vol2) |>
  select(-intgr1)
#> # A tibble: 7 × 5
#> group volume intgr remainder vol2
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 A 200 3 44 52
#> 2 A 200 3 44 52
#> 3 A 200 3 44 52
#> 4 A 200 3 44 44
#> 5 B 45 0 45 45
#> 6 C 104 2 0 52
#> 7 C 104 2 0 52
With data.table and rep:
library(data.table)
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), (volume%%52)[sign(volume%%52)])), group]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Or
setDT(df1)[, .(vol2 = c(rep(52, volume%/%52), volume%%52)), group][vol2 != 0]
#> group vol2
#> 1: A 52
#> 2: A 52
#> 3: A 52
#> 4: A 44
#> 5: B 45
#> 6: C 52
#> 7: C 52
Vectorised and without grouping:
df1 <- data.frame(group = c("A", "B", "C"),
volume = c(200L, 45L, 104L))
n <- 52
idx <- df1$volume %/% n + ((sel <- df1$volume %% n) != 0)
out <- df1[rep(seq_len(nrow(df1)), idx),]
out$volume <- n
out$volume[cumsum(idx)[sel != 0]] <- sel[sel != 0]
## group volume
##1 A 52
##1.1 A 52
##1.2 A 52
##1.3 A 44
##2 B 45
##3 C 52
##3.1 C 52
Another base R solution using aggregate:
aggregate(. ~ group, df1, \(x) c(rep(52, ceiling(x / 52) - 1), (x - 1) %% 52 + 1))
  group         volume
1     A 52, 52, 52, 44
2     B             45
3     C         52, 52
This results in a list column for volume (could be useful)
To transform it to a long dataframe we can either use stack:
with(
  aggregate(. ~ group, df1, \(x) c(rep(52, ceiling(x / 52) - 1), (x - 1) %% 52 + 1)),
  setNames(stack(setNames(volume, group))[2:1], names(df1))
)
  group volume
1     A     52
2     A     52
3     A     52
4     A     44
5     B     45
6     C     52
7     C     52
Or alternatively use unnest from tidyr
library(tidyr)
aggregate(. ~ group, df1, \(x) c(rep(52, ceiling(x / 52) - 1), (x - 1) %% 52 + 1)) %>% unnest(volume)
# A tibble: 7 × 2
  group volume
  <chr>  <dbl>
1 A         52
2 A         52
3 A         52
4 A         44
5 B         45
6 C         52
7 C         52
I am trying to simulate the following "game":
There is a population of 100 units
You randomly sample 10 of these units, record the id's of the units you saw, and then put them back into the population
You then take a second sample, record the id's of the units you saw in this second sample along with the first sample, and then put the second sample back into the population
Repeat this many times
I wrote the following code in R that performs the above procedure:
library(dplyr)
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,1,10)
var_3 = rnorm(100,5,10)
response = rnorm(100,1,1)
my_data = data.frame(var_1, var_2, var_3, response)
my_data$id = 1:100
results <- list()
results2<- list()
for (i in 1:100) {
  iteration_i = i
  sample_i = my_data[sample(nrow(my_data), 10), ]
  results_tmp = data.frame(iteration_i, sample_i)
  results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
  group_by(id) %>%
  filter(iteration_i == min(iteration_i)) %>%
  distinct)
summary_file = data.frame(test_1 %>% group_by(iteration_i) %>% summarise(Count = n()))
cumulative = cumsum(summary_file$Count)
summary_file$Cumulative = cumulative
summary_file$unobserved = 100 - cumulative
The result looks something like this:
> summary_file
iteration_i Count Cumulative unobserved
1 1 10 10 90
2 2 8 18 82
3 3 9 27 73
4 4 8 35 65
5 5 6 41 59
6 6 5 46 54
7 7 7 53 47
8 8 7 60 40
9 9 4 64 36
10 10 3 67 33
11 11 4 71 29
12 12 4 75 25
13 13 1 76 24
14 14 4 80 20
15 15 1 81 19
16 16 2 83 17
17 17 2 85 15
18 18 1 86 14
19 20 1 87 13
20 22 1 88 12
21 23 2 90 10
22 24 1 91 9
23 25 1 92 8
24 27 2 94 6
25 28 1 95 5
26 30 1 96 4
27 35 1 97 3
28 37 1 98 2
29 44 1 99 1
30 46 1 100 0
I would now like to repeat this "game" many times.
I would like to keep the "summary_file" for each "game" (e.g. summary_file_1, summary_file_2, summary_file_3, etc.)
I would then like to create a "total" summary file that shows the number of iterations that were required in each game to observe all units.
This total_summary_file would look something like this:
game_id iterations_required
1 game_1 47
2 game_2 45
3 game_3 44
4 game_4 42
5 game_5 42
Currently, I am just copy/pasting my earlier code several times, storing the results, appending everything at the end, and calculating the summary statistics. I am trying to find a way to "loop the loop" and do everything at once. Is it possible to introduce a command like "results_df_i <- do.call(rbind.data.frame, results_i)" into the loop and create everything at the same time, instead of manually copy/pasting the earlier loop?
You're making this a lot less efficient than it could be. To get, say, 100 repeated samples of 10 from the set 1:100 (with replacement), we can do replicate(100, sample(100, 10, TRUE)).
We can then coerce this into a vector and count the number of unique values every 10 entries along the vector until we get to 100. This gives us the number of iterations required to exhaust the samples.
If we put this inside an sapply, we don't even need an explicit loop, which means we can create the results data frame in a single call:
set.seed(1)
n_games <- 10
results <- data.frame(game_id = paste("game", seq(n_games), sep = "_"),
                      iterations_required = sapply(seq(n_games), function(x) {
                        samp <- c(replicate(100, sample(100, 10, TRUE)))
                        sum(sapply(1:100 * 10, function(n) length(unique(samp[1:n]))) < 100)
                      }))
results
#> game_id iterations_required
#> 1 game_1 59
#> 2 game_2 44
#> 3 game_3 54
#> 4 game_4 59
#> 5 game_5 57
#> 6 game_6 58
#> 7 game_7 96
#> 8 game_8 60
#> 9 game_9 71
#> 10 game_10 33
Created on 2022-06-11 by the reprex package (v2.0.1)
There are lots of ways to get your desired outcome; wrapping your loop in a function and running the function multiple times is another potential solution:
library(dplyr)
var_1 = rnorm(100,10,10)
var_2 = rnorm(100,1,10)
var_3 = rnorm(100,5,10)
response = rnorm(100,1,1)
my_data = data.frame(var_1, var_2, var_3, response)
my_data$id = 1:100
results <- list()
results2<- list()
sample_func <- function(output_file_name) {
  for (i in 1:100) {
    iteration_i = i
    sample_i = my_data[sample(nrow(my_data), 10), ]
    results_tmp = data.frame(iteration_i, sample_i)
    results[[i]] <- results_tmp
  }
  results_df <- do.call(rbind.data.frame, results)
  test_1 <- data.frame(results_df %>%
    group_by(id) %>%
    filter(iteration_i == min(iteration_i)) %>%
    distinct)
  summary_file = data.frame(test_1 %>% group_by(iteration_i) %>% summarise(Count = n()))
  cumulative = cumsum(summary_file$Count)
  summary_file$Cumulative = cumulative
  summary_file$unobserved = 100 - cumulative
  output_file_name <- summary_file
  return(output_file_name)
}
list_of_names <- paste0("game_", 1:10)
output <- lapply(list_of_names, sample_func)
names(output) <- list_of_names
head(output, n = 2)
#> $game_1
#> iteration_i Count Cumulative unobserved
#> 1 1 10 10 90
#> 2 2 9 19 81
#> 3 3 8 27 73
#> 4 4 8 35 65
#> 5 5 6 41 59
#> 6 6 6 47 53
#> 7 7 6 53 47
#> 8 8 4 57 43
#> 9 9 4 61 39
#> 10 10 3 64 36
#> 11 11 2 66 34
#> 12 12 1 67 33
#> 13 13 4 71 29
#> 14 14 1 72 28
#> 15 15 2 74 26
#> 16 16 2 76 24
#> 17 17 3 79 21
#> 18 18 4 83 17
#> 19 19 2 85 15
#> 20 20 2 87 13
#> 21 21 1 88 12
#> 22 24 1 89 11
#> 23 25 2 91 9
#> 24 26 1 92 8
#> 25 27 1 93 7
#> 26 30 1 94 6
#> 27 31 1 95 5
#> 28 33 1 96 4
#> 29 34 1 97 3
#> 30 36 1 98 2
#> 31 41 1 99 1
#> 32 66 1 100 0
#>
#> $game_2
#> iteration_i Count Cumulative unobserved
#> 1 1 10 10 90
#> 2 2 10 20 80
#> 3 3 7 27 73
#> 4 4 7 34 66
#> 5 5 8 42 58
#> 6 6 5 47 53
#> 7 7 7 54 46
#> 8 8 5 59 41
#> 9 9 1 60 40
#> 10 10 7 67 33
#> 11 11 3 70 30
#> 12 12 3 73 27
#> 13 13 1 74 26
#> 14 14 3 77 23
#> 15 15 4 81 19
#> 16 16 3 84 16
#> 17 17 2 86 14
#> 18 18 1 87 13
#> 19 19 2 89 11
#> 20 20 1 90 10
#> 21 21 2 92 8
#> 22 22 1 93 7
#> 23 25 2 95 5
#> 24 27 1 96 4
#> 25 29 2 98 2
#> 26 30 1 99 1
#> 27 41 1 100 0
The output is a list, so you can use purrr to apply functions to each element (e.g. https://purrr.tidyverse.org/reference/lmap.html) or use the bind_rows() function to create a single dataframe for further use, e.g.
df2 <- bind_rows(output, .id = "game") %>%
  group_by("Game" = factor(game, levels = list_of_names)) %>%
  summarise(rows_in_output = n(),
            number_of_iterations = max(iteration_i))
df2
#> # A tibble: 10 × 3
#> Game rows_in_output number_of_iterations
#> <fct> <int> <int>
#> 1 game_1 32 66
#> 2 game_2 27 41
#> 3 game_3 27 48
#> 4 game_4 32 50
#> 5 game_5 27 35
#> 6 game_6 27 71
#> 7 game_7 28 68
#> 8 game_8 27 48
#> 9 game_9 29 43
#> 10 game_10 29 66
Created on 2022-06-17 by the reprex package (v2.0.1)
Or you can use list2env() to have each individual dataframe in your environment, e.g.
list2env(output, envir = .GlobalEnv)
ls()
#> [1] "df2" "game_1" "game_10" "game_2" "game_3" "game_4"
#> [7] "game_5" "game_6" "game_7" "game_8" "game_9" "list_of_names"
#> [13] "my_data" "output" "response" "results" "results2" "sample_func"
#> [19] "var_1" "var_2" "var_3"
This seems easily solvable using recursion:
fun <- function(x, i = 1, size = 10){
  a <- setdiff(x, sample(100, 10, TRUE))  # Remove the seen from x
  if(length(a)) Recall(a, i + 1) else i   # if we have unobserved, call fun again
}
Now we can have as many games as we want:
data.frame(game = paste0('game',seq(10)), results = replicate(10, fun(1:100)))
game results
1 game1 62
2 game2 40
3 game3 51
4 game4 50
5 game5 34
6 game6 83
7 game7 38
8 game8 40
9 game9 53
10 game10 41
You could also do
hist(replicate(1000, fun(1:100)), breaks = 30)
Edit:
Note that this can be edited to take in any size and vector, e.g.:
fun <- function(x, size = 10, y = x, i = 1){
  a <- setdiff(x, sample(y, size, TRUE))
  cat('i', i, '\t a: ', a, '\n')
  if(length(a) > 0) Recall(a, size, y, i + 1) else i
}
set.seed(117); fun(1:10, 1)
i 1 a: 1 2 4 5 6 7 8 9 10 # 3 removed
i 2 a: 1 2 4 5 7 8 9 10 # 6 removed
i 3 a: 1 2 4 5 7 8 9 # 10 removed
i 4 a: 1 2 4 5 8 9 # 7 removed
i 5 a: 1 2 5 8 9 # 4 removed
i 6 a: 1 2 5 8 9 # Nothing removed
i 7 a: 1 5 8 9 # 2 removed
i 8 a: 1 5 8 # 9 removed
i 9 a: 1 5 # Nothing removed
i 10 a: 1 5 # Nothing removed
i 11 a: 5 # 1 removed
i 12 a: # 5 removed
Using Markov chains, we can produce the cumulative distribution function for the number of iterations required for a game (up to machine precision). The resulting CDF can be sampled directly using findInterval.
We can simplify things slightly by starting with the second iteration, since the first iteration will always result in 90 unseen units.
First, set up a matrix for all possible transitions:
m <- matrix(c(rep(90:1, each = 11), sequence(rep(11,90), 90:1, -1)), ncol = 2, dimnames = list(NULL, c("from", "to")))
m <- m[m[,2] >= 0L,]
Then create a transition matrix with row 1 representing the state where all units have been seen and row 91 representing the state where 10 units have been seen:
mTrans <- matrix(0, 91, 91)
The number of previously unseen units selected follows the hypergeometric distribution.
mTrans[m + 1L] <- dhyper(m[,1] - m[,2], m[,1], 100L - m[,1], 10L)
Row 1 represents an absorbing state since all units have been seen.
mTrans[1, 1] <- 1
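A quick sanity check on this construction: every "from" state must transition to some "to" state, so each row of mTrans should now sum to 1.
range(rowSums(mTrans))  # both values are 1, up to floating-point error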
mTrans contains the probabilities of each state after the second iteration.
Initialize a while loop and calculate the CDF.
mm <- mTrans %*% mTrans
maxIter <- 1000L
p <- numeric(maxIter)
iter <- 3L
while (p[iter] < 1) {
if ((iter <- iter + 1L) > maxIter) {
p <- c(p, numeric(maxIter))
maxIter <- maxIter*2L
}
mm <- mm %*% mTrans
p[iter] <- mm[91, 1]
}
p <- p[1:iter]
iter
#> [1] 345
Machine precision limits the CDF to less than 345 iterations. Plot the CDF:
plot(p, xlab = "iterations", ylab = "cumulative probability")
Using findInterval we can quickly generate a large number of random samples of the iterations required.
ngames <- 1e6L # one million games
results <- data.frame(game_id = 1:ngames, iterations_required = findInterval(runif(ngames), p))
head(results)
#> game_id iterations_required
#> 1 1 73
#> 2 2 69
#> 3 3 40
#> 4 4 41
#> 5 5 44
#> 6 6 43
Get a histogram of the sample number of iterations required.
hist(results$iterations_required)
OP here! I think I was able to find an answer to my own question:
library(dplyr)
var_1 <- rnorm(100, 10, 10)
var_2 <- rnorm(100, 1, 10)
var_3 <- rnorm(100, 5, 10)
response <- rnorm(100, 1, 1)
my_data <- data.frame(var_1, var_2, var_3, response)
my_data$id <- 1:100
simulate <- function() {
results <- list()
results2 <- list()
for (i in 1:100) {
iteration_i <- i
sample_i <- my_data[sample(nrow(my_data), 10), ]
results_tmp <- data.frame(iteration_i, sample_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
test_1 <- data.frame(results_df %>%
group_by(id) %>%
filter(iteration_i == min(iteration_i)) %>%
distinct)
summary_file <- data.frame(test_1 %>%
group_by(iteration_i) %>%
summarise(Count=n()))
cumulative <- cumsum(summary_file$Count)
summary_file$Cumulative <- cumulative
summary_file$unobserved <- 100 - cumulative
return(summary_file)
}
# now, loop 10 times!
results <- list()
for (i in 1:10) {
game_i <- i
s_i <- simulate()
results_tmp <- data.frame(game_i, s_i)
results[[i]] <- results_tmp
}
final_file <- do.call(rbind.data.frame, results)
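From final_file, the per-game summary the question asked for can then be derived; a minimal sketch (assuming every game observes all 100 units within its 100 iterations, so the largest recorded iteration_i is the completing iteration):
total_summary_file <- aggregate(iteration_i ~ game_i, data = final_file, FUN = max)
names(total_summary_file) <- c("game_id", "iterations_required")
total_summary_file$game_id <- paste0("game_", total_summary_file$game_id)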
Thanks for your help everyone!
I have this function*,
library(dplyr)  # for %>%, mutate() and filter(); the collapse package must also be installed for collapse::flag()

x <- data.frame(
  Ones = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  Thats = c(0, 5, 3, 6, 8, 4, 5, 6, 8, 3, 1, NA, 4, 5, 6, 7, 4, 3, 4, 5))
f <- function(df, n){
  df %>%
    collapse::flag(-n:n) %>%
    rowSums(na.rm = TRUE) - 1
}
x %>%
  mutate(gap1 = f(., 1),
         gap2 = f(., 2),
         gap3 = f(., 3),
         gap4 = f(., 4),
         gap5 = f(., 5)) %>%
  filter(Ones == 1)
It finds where the variable Ones is equal to 1 and sums the values of Thats in a window around that row:
if gap1 = f(., 1), it sums the values at rows n-1, n and n+1 of Thats, where n is the row containing the 1;
if gap2 = f(., 2), it sums rows n-2, n-1, n, n+1 and n+2;
and so on.
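Working through the first row of x as a check: Ones is 1 in row 1 and the Thats values from row 1 onward are 0, 5, 3, 6, 8, 4, ..., so gap1 = 0 + 5 = 5, gap2 = 0 + 5 + 3 = 8, gap3 = 0 + 5 + 3 + 6 = 14 and gap4 = 22. gap5 comes out as 27 rather than 26 because flag() lags every column, so the second 1 in Ones (row 6) also falls inside the ±5 window and is added on.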
This is the output.
Ones Thats gap1 gap2 gap3 gap4 gap5
1 1 0 5 8 14 22 27
2 1 4 17 29 40 48 51
3 1 1 4 16 27 38 50
Now I want to use this function for each group of my sample.
This is a facsimile of my sample. The real sample has more conditions and more observations, but this is just to give you an idea.
dat<- data.frame (
'sub' = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2),
'trial' = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
'Thumb'= c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0),
'Index'= c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5,4,5,3,23,2,1,4,6,3,2,3,2,4,6,3,2,4,3,2,1))
I need to apply the function to all groups of my sample.
For example, in this specific case I would have 4 conditions:
sub1/trial1, sub2/trial1, sub1/trial2, sub2/trial2
I tried to write the function in this way:
f <- function(df, n){
  df %>%
    group_by(trial, sub) %>%
    collapse::flag(-n:n) %>%
    rowSums(na.rm = TRUE) - 1
}
But it doesn't work.
Do you have any advice to give me?
Please consider that the sample above is only an example... I need a general function working also with more conditions and observations.
Thank you,
Best regards
*credits: Kindly proposed by Donald Seinen on this site.
We may need group_modify if we are using the first function f:
library(dplyr)
dat %>%
  group_by(trial, sub) %>%
  group_modify(~ .x %>%
                 mutate(gap1 = f(.x, 1), gap2 = f(.x, 2),
                        gap3 = f(.x, 3), gap4 = f(.x, 4), gap5 = f(.x, 5))) %>%
  ungroup
-output
# A tibble: 40 × 9
trial sub Thumb Index gap1 gap2 gap3 gap4 gap5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 5 8 14 22 27
2 1 1 0 5 8 14 22 27 32
3 1 1 0 3 13 22 27 32 38
4 1 1 0 6 16 26 32 38 46
5 1 1 0 8 18 26 37 46 49
6 1 1 1 4 17 29 40 48 49
7 1 1 0 5 15 31 40 43 48
8 1 1 0 6 18 26 34 40 43
9 1 1 0 8 16 21 26 34 40
10 1 1 0 3 10 16 21 26 34
# … with 30 more rows
which is the same result we get on the subset of data for the first combination
dat %>%
  filter(sub == 1, trial == 1) %>%
  select(-sub, -trial) %>%
  mutate(gap1 = f(., 1),
         gap2 = f(., 2),
         gap3 = f(., 3),
         gap4 = f(., 4),
         gap5 = f(., 5))
Thumb Index gap1 gap2 gap3 gap4 gap5
1 1 0 5 8 14 22 27
2 0 5 8 14 22 27 32
3 0 3 13 22 27 32 38
4 0 6 16 26 32 38 46
5 0 8 18 26 37 46 49
6 1 4 17 29 40 48 49
7 0 5 15 31 40 43 48
8 0 6 18 26 34 40 43
9 0 8 16 21 26 34 40
10 0 3 10 16 21 26 34
I have a dataframe like the following:
combo_2 combo_4 combo_7 combo_9
12 23 14 17
21 32 41 71
2 3 1 7
1 2 4 1
21 23 14 71
2 32 1 7
Each column has two single-digit values and two double-digit values composed of the single-digit values in each possible order.
I am trying to determine how to replace certain values in the dataframe so that there is only one version of the double-digit value. For example, all values of 21 in the first column should be 12. All values of 32 in the second column should become 23.
I know I can do something like this using the following code:
df <- df %>%
  mutate_at(vars(combo_2, combo_4, combo_7, combo_9), function(x)
    case_when(x == 21 ~ 12, x == 32 ~ 23, x == 41 ~ 14, x == 71 ~ 17))
The problem with this is that it gives me a dataframe that contains the correct values when specified but leaves all the other values as NA. The resulting dataframe only contains values where 21, 32, 41, and 71 were. I know I could address this by specifying each value, like x == 1 ~ 1. However, I have many values and would prefer to only specify the ones that I am trying to change.
How can I replace several values in a dataframe without all the other values becoming NA? Is there a way for me to replace the values I want to replace while holding the other values the same without directly specifying those values?
You can use TRUE ~ x at the end of your case_when() sequence:
df %>%
  mutate_at(vars(combo_2, combo_4, combo_7, combo_9), function(x)
    case_when(x == 21 ~ 12, x == 32 ~ 23, x == 41 ~ 14, x == 71 ~ 17, TRUE ~ x))
combo_2 combo_4 combo_7 combo_9
1 12 23 14 17
2 12 23 14 17
3 2 3 1 7
4 1 2 4 1
5 12 23 14 17
6 2 23 1 7
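On current dplyr (1.1 or later), where mutate_at() is superseded, the same idea can also be written with across() and case_when()'s .default argument; a sketch assuming that version:
df %>%
  mutate(across(c(combo_2, combo_4, combo_7, combo_9),
                \(x) case_when(x == 21 ~ 12, x == 32 ~ 23,
                               x == 41 ~ 14, x == 71 ~ 17,
                               .default = x)))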
Another option that may be more efficient would be data.table's fcase() function.
Data:
df = read.table(header = TRUE, text = "combo_2 combo_4 combo_7 combo_9
12 23 14 17
21 32 41 71
2 3 1 7
1 2 4 1
21 23 14 71
2 32 1 7")
df[] = lapply(df, as.double) # side-note: tidyverse has become very strict about types
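A minimal sketch of that fcase() idea, using the df built above; the trailing rep(TRUE, length(x)) pair plays the role of case_when()'s TRUE ~ x:
library(data.table)
dt <- as.data.table(df)
cols <- c("combo_2", "combo_4", "combo_7", "combo_9")
dt[, (cols) := lapply(.SD, function(x)
  fcase(x == 21, 12,
        x == 32, 23,
        x == 41, 14,
        x == 71, 17,
        rep(TRUE, length(x)), x)),   # "else keep x as-is"
  .SDcols = cols]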
One dplyr and stringi option may be:
library(stringi)

df %>%
  mutate(across(everything(),
                ~ if_else(. %in% c(21, 32, 41, 71), as.integer(stri_reverse(.)), .)))
combo_2 combo_4 combo_7 combo_9
1 12 23 14 17
2 12 23 14 17
3 2 3 1 7
4 1 2 4 1
5 12 23 14 17
6 2 23 1 7
Using mapply:
df1[] <- mapply(function(d, x1, x2){ ifelse(d == x1, x2, d) },
d = df1,
x1 = c(21, 32, 41, 71),
x2 = c(12, 23, 14, 17))
df1
# combo_2 combo_4 combo_7 combo_9
# 1 12 23 14 17
# 2 12 23 14 17
# 3 2 3 1 7
# 4 1 2 4 1
# 5 12 23 14 17
# 6 2 23 1 7
Description of Data: Dataset contains information regarding users about their age, gender and membership they are holding.
Goal: Create a new column to identify the group/label for each user based on pre-defined conditions.
Age conditions: multiple age brackets :
18 <= age <= 24, 25 <= age <= 30, 31 <= age <= 41, 42 <= age <= 60, age >= 61
Gender: M/F
Membership: A,B,C,I
I created a sample data frame to try out creating the new column that identifies the group/label:
df = data.frame(userid = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12),
age = c(18, 61, 23, 35, 30, 25, 55, 53, 45, 41, 21, NA),
gender = c('F', 'M', 'F', 'F', 'M', 'M', 'M', 'M', 'M', 'F', '<NA>', 'M'),
membership = c('A', 'B', 'A', 'C', 'C', 'B', 'A', 'A', 'I', 'I', 'A', '<NA>'))
userid age gender membership
1 1 18 F A
2 2 61 M B
3 3 23 F A
4 4 35 F C
5 5 30 M C
6 6 25 M B
7 7 55 M A
8 8 53 M A
9 9 45 M I
10 10 41 F I
11 11 21 <NA> A
12 12 NA M <NA>
Based on the above data there exist 4 * 2 * 5 = 40 combinations (membership x gender x age bracket).
Final outcome:
userid age gender membership GroupID
1 1 18 F A 1
2 2 61 M B 40
3 3 23 F A 1
4 4 35 F C 4
5 5 30 M C 5
6 6 25 M B 3
7 7 55 M A 32
8 8 53 M A 32
9 9 45 M I 34
10 10 41 F I 35
11 11 21 <NA> A 43 (assuming it will auto-detect the combo)
12 12 NA M <NA> 46
I believe my calculation of the combinations is correct; if so, how can I use dplyr or any other option to get the above data frame?
Should I use multiple if conditions to cover all the options?
In dplyr, is there a way to provide conditions for each column to set the grouping, along the lines of:
df %>% group_by(age, gender, membership)
Two options.
One, more automated:
# install.packages("tidyverse", dependencies = TRUE)
library(tidyverse)
df %>%
  mutate(ageCat = cut(age, breaks = c(-Inf, 24, 30, 41, 60, Inf))) %>%
  mutate(GroupID = group_indices(., ageCat, gender, membership)) %>%
  select(-ageCat)
#> userid age gender membership GroupID
#> 1 1 18 F A 2
#> 2 2 61 M B 9
#> 3 3 23 F A 2
#> 4 4 35 F C 5
#> 5 5 30 M C 4
#> 6 6 25 M B 3
#> 7 7 55 M A 7
#> 8 8 53 M A 7
#> 9 9 45 M I 8
#> 10 10 41 F I 6
#> 11 11 21 <NA> A 1
#> 12 12 NA M <NA> 10
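On newer dplyr (1.0 and later) group_indices() is superseded; an equivalent sketch using cur_group_id() (assuming that version) would be:
df %>%
  mutate(ageCat = cut(age, breaks = c(-Inf, 24, 30, 41, 60, Inf))) %>%
  group_by(ageCat, gender, membership) %>%
  mutate(GroupID = cur_group_id()) %>%
  ungroup() %>%
  select(-ageCat)
The numbering may differ slightly from group_indices(), but each distinct ageCat/gender/membership combination still gets a single ID.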
Two, more manual:
Here I illustrate a solution for categories 1 and 4; you would have to code the rest yourself.
df %>% mutate(GroupID =
  ifelse(age >= 18 & age <= 24 & gender == 'F' & membership == "A", 1,
  ifelse(age >= 31 & age <= 41 & gender == 'F' & membership == "C", 4, NA)
  ))
#> userid age gender membership GroupID
#> 1 1 18 F A 1
#> 2 2 61 M B NA
#> 3 3 23 F A 1
#> 4 4 35 F C 4
#> 5 5 30 M C NA
#> 6 6 25 M B NA
#> 7 7 55 M A NA
#> 8 8 53 M A NA
#> 9 9 45 M I NA
#> 10 10 41 F I NA
#> 11 11 21 <NA> A NA
#> 12 12 NA M <NA> NA
You can try this:
setDT(df)[, agegrp := ifelse(df$age >= 18 & df$age <= 24, 1,
                      ifelse(df$age >= 25 & df$age <= 30, 2,
                      ifelse(df$age >= 31 & df$age <= 41, 3,
                      ifelse(df$age >= 42 & df$age <= 60, 4, 5))))]
setDT(df)[, group := .GRP, by = .(agegrp, gender, membership)]
If you want to use base R only, you could do something like this:
# 1
allcombos <- expand.grid(c("M", "F"), c("A", "B", "C", "I"), 1:5)
allgroups <- do.call(paste0, allcombos) # 40 unique combinations
# 2
agegroups <- cut(df$age,
                 breaks = c(17, 24, 30, 41, 60, Inf),
                 labels = c(1, 2, 3, 4, 5))
# 3
df$groupid <- paste0(df$gender, df$membership, agegroups)
df$groupid <- factor(df$groupid, levels=allgroups, labels=1:length(allgroups))
expand.grid gives you a data.frame with three columns where every row represents a unique combination of the three arguments provided. As you said, these are 40 combinations. The second line combines every row of the data frame into a single string, like "MA1", "FA1", "MB1", etc.
Then we use cut to assign each age to its relevant age group, labelled 1 to 5.
Finally, we create a column in df that contains the three-character combination of gender, membership and age group, which is then converted to a factor according to all possible combinations we found in allgroups.
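For example, the first few combined labels follow exactly that pattern:
head(allgroups, 8)
#> [1] "MA1" "FA1" "MB1" "FB1" "MC1" "FC1" "MI1" "FI1"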