expand.grid with unknown set of variables - r

So, expand.grid returns a df of all the combinations of the vectors passed.
df <- expand.grid(1:3, 1:3)
df <- expand.grid(1:3, 1:3, 1:3)
What I would like is a generalized function that takes 1 parameter (number of vectors) and returns the appropriate data frame.
combinations <- function(n) {
return(expand.grid(0, 1, ... n))
}
Such that
combinations(2) returns(expand.grid(1:3, 1:3))
combinations(3) returns(expand.grid(1:3, 1:3, 1:3))
combinations(4) returns(expand.grid(1:3, 1:3, 1:3, 1:3))
etc.

combinations <- function(n)
expand.grid(rep(list(1:3),n))
> combinations(2)
Var1 Var2
1 1 1
2 2 1
3 3 1
4 1 2
5 2 2
6 3 2
7 1 3
8 2 3
9 3 3
> combinations(3)
Var1 Var2 Var3
1 1 1 1
2 2 1 1
3 3 1 1
4 1 2 1
5 2 2 1
6 3 2 1
7 1 3 1
8 2 3 1
9 3 3 1
10 1 1 2
11 2 1 2
12 3 1 2
13 1 2 2
14 2 2 2
15 3 2 2
16 1 3 2
17 2 3 2
18 3 3 2
19 1 1 3
20 2 1 3
21 3 1 3
22 1 2 3
23 2 2 3
24 3 2 3
25 1 3 3
26 2 3 3
27 3 3 3

Related

Efficient code to remove rows containing non-unique max?

Here's a simple example of an array for which I want to extract only those rows whose max value is unique (in that row).
foo <- expand.grid(1:3,1:3,1:3)
Var1 Var2 Var3
1 1 1 1
2 2 1 1
3 3 1 1
4 1 2 1
5 2 2 1
6 3 2 1
7 1 3 1
8 2 3 1
9 3 3 1
10 1 1 2
11 2 1 2
12 3 1 2
13 1 2 2
14 2 2 2
15 3 2 2
16 1 3 2
17 2 3 2
18 3 3 2
19 1 1 3
20 2 1 3
21 3 1 3
22 1 2 3
23 2 2 3
24 3 2 3
25 1 3 3
26 2 3 3
27 3 3 3
I've got working code:
winners <- max.col(foo)
finddupe <- rep(0,length=length(winners))
for (jf in 1:length(winners)) finddupe[jf] <- sum(foo[jf,] == foo[jf, winners[jf] ] )
winners <- winners[finddupe == 1]
foo <- foo[finddupe == 1, ]
That just looks inefficient to me.
I'd prefer a solution which only uses base - R calls, but am open to using tools in other libraries.
Another base R solution:
subset(foo, max.col(foo, 'first') == max.col(foo, 'last'))
Var1 Var2 Var3
2 2 1 1
3 3 1 1
4 1 2 1
6 3 2 1
7 1 3 1
8 2 3 1
10 1 1 2
12 3 1 2
15 3 2 2
16 1 3 2
17 2 3 2
19 1 1 3
20 2 1 3
22 1 2 3
23 2 2 3
>
Same logic as above in dplyr way:
library(dplyr)
foo %>%
filter(max.col(., 'first') == max.col(., 'last'))
Create a column of max with pmax from all the columns, then filter the rows where there is only a single unique max by getting the count on a logical dataset with rowSums
library(dplyr)
foo %>%
mutate(mx = do.call(pmax, c(across(everything()), na.rm = TRUE))) %>%
filter(rowSums(across(Var1:Var3, ~ .x == mx), na.rm = TRUE) == 1)
-output
Var1 Var2 Var3 mx
1 2 1 1 2
2 3 1 1 3
3 1 2 1 2
4 3 2 1 3
5 1 3 1 3
6 2 3 1 3
7 1 1 2 2
8 3 1 2 3
9 3 2 2 3
10 1 3 2 3
11 2 3 2 3
12 1 1 3 3
13 2 1 3 3
14 1 2 3 3
15 2 2 3 3
Or with base R
subset(foo, rowSums(foo == do.call(pmax, c(foo, na.rm = TRUE)),
na.rm = TRUE) == 1)
A base R approach using apply
foo[apply(foo, 1, function(x) sum(x[which.max(x)] == x) <= 1), ]
Var1 Var2 Var3
2 2 1 1
3 3 1 1
4 1 2 1
6 3 2 1
7 1 3 1
8 2 3 1
10 1 1 2
12 3 1 2
15 3 2 2
16 1 3 2
17 2 3 2
19 1 1 3
20 2 1 3
22 1 2 3
23 2 2 3
After verifying the answers so far (18:00 EST Weds 15 Feb), I ran a benchmark comparison. #onyambu wins the race. (cgw is me; ak** are akrun's solutions)
bar5 = 1:5
foo55 <- expand.grid(bar5,bar5,bar5,bar5,bar5)
microbenchmark(ony(foo55), cgw(foo55), akply(foo55), akbase(foo55), andre(foo55))
Unit: microseconds
expr min lq mean median uq max neval cld
ony(foo55) 455.117 495.2335 589.6801 517.3755 634.9795 3107.222 100 a
cgw(foo55) 314076.038 317184.4050 348711.9522 319784.5870 324921.0335 2691161.873 100 b
akply(foo55) 14156.653 14835.2230 16194.3699 15160.0270 16441.3550 74019.622 100 a
akbase(foo55) 858.969 896.8310 1055.4277 970.6395 1117.2420 4098.860 100 a
andre(foo55) 8161.406 8531.1700 9188.4801 8872.0325 9284.0995 14548.383 100 a

identify whenever values repeat in r

I have a dataframe like this.
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3))
I want to populate a new variable Sequence which identifies whenever Condition starts again from 1.
So the new dataframe would look like this.
Thanks in advance for the help!
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3),
Sequence = c(1,1,1,1,2,2,2,2,2,2,3,3,3,3,3))
base R
data$Sequence2 <- cumsum(c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1))
data
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
dplyr
library(dplyr)
data %>%
mutate(
Sequence2 = cumsum(Condition == 1 & lag(Condition != 1, default = TRUE))
)
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
This took a while. Finally I find this solution:
library(dplyr)
data %>%
group_by(Sequnce = cumsum(
ifelse(Condition==1, lead(Condition)+1, Condition)
- Condition==1)
)
Condition Sequnce
<dbl> <int>
1 1 1
2 1 1
3 2 1
4 3 1
5 1 2
6 1 2
7 2 2
8 2 2
9 2 2
10 3 2
11 1 3
12 1 3
13 2 3
14 3 3
15 3 3

Filter ids when a maximum score is not observed in r

I need to filter ids that do not have maximum score points in them. Here is my sample dataset looks like
df <- data.frame(id = c(1,1,1,1,1, 2,2,2,2, 3,3,3,3,3, 4,4,4,4,4, 5,5,5),
score = c(0,1,2,0,1, 1,0,1,1, 0,1,2,3,3, 3,1,2,0,3, 0,1,0),
max.score = c(2,2,2,2,2, 1,1,1,1, 4,4,4,4,4, 3,3,3,3,3, 2,2,2))
> df
id score max.score
1 1 0 2
2 1 1 2
3 1 2 2
4 1 0 2
5 1 1 2
6 2 1 1
7 2 0 1
8 2 1 1
9 2 1 1
10 3 0 4
11 3 1 4
12 3 2 4
13 3 3 4
14 3 3 4
15 4 3 3
16 4 1 3
17 4 2 3
18 4 0 3
19 4 3 3
20 5 0 2
21 5 1 2
22 5 0 2
In this dataframe, I need to filter ids c(3,5) because these ids do not have the max.score in them. The desired output would be:
> df
id score max.score
1 3 0 4
2 3 1 4
3 3 2 4
4 3 3 4
5 3 3 4
6 5 0 2
7 5 1 2
8 5 0 2
Any ideas?
Thanks

Group by each increasing sequence in data frame

If I have a data frame with a column of monotonically increasing values such as:
x
1
2
3
4
1
2
3
1
2
3
4
5
6
1
2
How do I add a column to group each increasing sequence that results in:
x y
1 1
2 1
3 1
4 1
1 2
2 2
3 2
1 3
2 3
3 3
4 3
5 3
6 3
1 4
2 4
I can only think of using a loop which will be slow.
You may choose cumsum function to do it.
> x <- c(1,2,3,4,1,2,3,1,2,4,5,1,2)
> cumsum(x==1)
[1] 1 1 1 1 2 2 2 3 3 3 3 4 4
I would use diff and compute the cumulative sum:
df$y <- c(1, cumsum(diff(df$x) < 0 ) + 1)
> df
x y
1 1 1
2 2 1
3 3 1
4 4 1
5 1 2
6 2 2
7 3 2
8 1 3
9 2 3
10 3 3
11 4 3
12 5 3
13 6 3
14 1 4
15 2 4

Create a block column based on id and the value of another column in R

Given the following first two columns(id and time_diff), i want to generate the 'block' column
test
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5
The data is already sorted by id and time. The time_diff was computed based on the difference of the previous time and the time value for the row, given the same id. I want to create a block id which is an auto-increment value and increases when a new ID or a time_diff of >10 with the same id is encountered.
How can I achieve this in R?
Importing your data as a data frame with something like:
df = read.table(text='
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5')
You can do a one-liner like this to get occurrences satisfying your two conditions:
> new_col = as.vector(cumsum(
na.exclude(
c(F,diff(as.numeric(as.factor(df$id)))) | # change of id OR
df$time_diff > 10 # time_diff greater than 10
)
))
> new_col
[1] 0 0 0 0 0 1 2 2 2 2 3 3 4 4 4
And finally append this new column to your dataframe with cbind:
> cbind(df, block = c(0,new_col))
id time_diff block block
1 a NA 1 0
2 a 1 1 0
3 a 1 1 0
4 a 1 1 0
5 a 3 1 0
6 a 3 1 0
7 b NA 2 1
8 b 11 3 2
9 b 1 3 2
10 b 1 3 2
11 b 1 3 2
12 b 12 4 3
13 b 1 4 3
14 c NA 5 4
15 c 4 5 4
16 c 7 5 4
You will notice an offset between your wanted block variable and mine: correcting it is easy and can be done at several different step, I will leave it to you :)
Another variation of #Jealie's method would be:
with(test, cumsum(c(TRUE,id[-1]!=id[-nrow(test)])|time_diff>10))
#[1] 1 1 1 1 1 1 2 3 3 3 3 4 4 5 5 5
After learning from Jealie and akrun, I came up with this idea.
mydf %>%
mutate(group = cumsum(time_diff > 10 |!duplicated(id)))
# id time_diff block group
#1 a NA 1 1
#2 a 1 1 1
#3 a 1 1 1
#4 a 1 1 1
#5 a 3 1 1
#6 a 3 1 1
#7 b NA 2 2
#8 b 11 3 3
#9 b 1 3 3
#10 b 1 3 3
#11 b 1 3 3
#12 b 12 4 4
#13 b 1 4 4
#14 c NA 5 5
#15 c 4 5 5
#16 c 7 5 5
Here is an approach using dplyr:
require(dplyr)
set.seed(999)
test <- data.frame(
id = rep(letters[1:4], each = 3),
time_diff = sample(4:15)
)
test %>%
mutate(
b = as.integer(id) - lag(as.integer(id)),
more10 = time_diff > 10,
increment = pmax(b, more10, na.rm = TRUE),
increment = ifelse(row_number() == 1, 1, increment),
block = cumsum(increment)
) %>%
select(id, time_diff, block)
Try:
> df
id time_diff
1 a NA
2 a 1
3 a 1
4 a 1
5 a 3
6 a 3
7 b NA
8 b 11
9 b 1
10 b 1
11 b 1
12 b 12
13 b 1
14 c NA
15 c 4
16 c 7
block= c(1)
for(i in 2:nrow(df))
block[i] = ifelse(df$time_diff[i]>10 || df$id[i]!=df$id[i-1],
block[i-1]+1,
block[i-1])
df$block = block
df
id time_diff block
1 a NA 1
2 a 1 1
3 a 1 1
4 a 1 1
5 a 3 1
6 a 3 1
7 b NA 2
8 b 11 3
9 b 1 3
10 b 1 3
11 b 1 3
12 b 12 4
13 b 1 4
14 c NA 5
15 c 4 5
16 c 7 5

Resources