I am working with the R programming language. I have the following data:
library(dplyr)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE), var_2 = sample(c(0,1), 15, replace = TRUE))
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
The data looks something like this:
id index var_1 var_2
1 1 1 0 1
2 1 2 0 0
3 1 3 1 1
4 1 4 0 1
5 2 1 1 0
6 2 2 1 1
7 2 3 0 1
8 3 1 1 0
9 4 1 0 0
10 4 2 0 0
11 5 1 0 0
12 5 2 1 0
13 5 3 0 1
14 5 4 0 0
15 5 5 0 1
I want to create two new variables (v_1, v_2). For each unique "id":
v_1: I want v_1 to be the average of the current, previous and previous-to-previous values of var_1 (i.e. index = n, index = n-1 and index = n-2). When this is not possible (e.g. for index = 2 and index = 1), I want the average to go as far back as it can.
v_2: the same as v_1, but computed on var_2.
This would be something like this:
row 1 (id = 1, index = 1): v_1 = var_1 (index 1)
row 2 (id = 1, index = 1; id = 1, index = 2): v_1 = (var_1 (index 1) + var_1 (index 2))/2
row 3 (id = 1, index = 1; id = 1, index = 2; id = 1, index = 3): v_1 = (var_1 (index 1) + var_1 (index 2) + var_1 (index 3))/3
row 4 (id = 1, index = 2; id = 1, index = 3; id = 1, index = 4): v_1 = (var_1 (index 2) + var_1 (index 3) + var_1 (index 4))/3
etc.
I tried to do this with the following code:
average_data = my_data %>%
  group_by(id) %>%
  summarise(v_1 = mean(tail(var_1, 3)),
            v_2 = mean(tail(var_2, 3)))
# final_result
final_data = merge(x = my_data, y = average_data, by = "id", all.x = TRUE)
But I am not sure if this is correct.
Can someone please show me how to do this?
Thanks!
data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
tidyverse
library(tidyverse)
df %>%
  group_by(id) %>%
  mutate(across(starts_with("var_"),
                .fns = ~zoo::rollapply(data = .x, width = 3, FUN = mean, partial = TRUE, align = "right"),
                .names = "new_{.col}")) %>%
  ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 new_var_1 new_var_2
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-06 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- gsub("ar", "", grep("var_", names(df), value = TRUE))  # "var_1" -> "v_1", "var_2" -> "v_2"
setDT(df)[,
  (COLS) := lapply(.SD, function(x) zoo::rollapply(data = x, width = 3, FUN = mean, partial = TRUE, align = "right")),
  by = id,
  .SDcols = patterns("var_")][]
#> id index var_1 var_2 v_1 v_2
#> 1: 1 1 0 1 0.0000000 1.0000000
#> 2: 1 2 0 0 0.0000000 0.5000000
#> 3: 1 3 1 1 0.3333333 0.6666667
#> 4: 1 4 0 1 0.3333333 0.6666667
#> 5: 2 1 1 0 1.0000000 0.0000000
#> 6: 2 2 1 1 1.0000000 0.5000000
#> 7: 2 3 0 1 0.6666667 0.6666667
#> 8: 3 1 1 0 1.0000000 0.0000000
#> 9: 4 1 0 0 0.0000000 0.0000000
#> 10: 4 2 0 0 0.0000000 0.0000000
#> 11: 5 1 0 0 0.0000000 0.0000000
#> 12: 5 2 1 0 0.5000000 0.0000000
#> 13: 5 3 0 1 0.3333333 0.3333333
#> 14: 5 4 0 0 0.3333333 0.3333333
#> 15: 5 5 0 1 0.0000000 0.6666667
Created on 2022-06-06 by the reprex package (v2.0.1)
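Both solutions lean on partial = TRUE together with align = "right", which shrinks the window at the start of each group instead of returning NA. A quick standalone illustration:
zoo::rollapply(c(0, 0, 1, 0), width = 3, FUN = mean, partial = TRUE, align = "right")
#> [1] 0.0000000 0.0000000 0.3333333 0.3333333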
I would say this is a moving average, and it can be implemented by a function f like below, using embed (preferable) or sapply (less efficient, not recommended), and run group-wise using ave:
f <- function(v, n = 3) {
  rowMeans(embed(c(rep(NA, n - 1), v), n), na.rm = TRUE)
}
or
f <- function(v, n = 3) {
  sapply(
    seq_along(v),
    function(k) sum(v[pmax(k - n + 1, 1):k]) / pmin(k, n)
  )
}
and then we run
transform(
  df,
  v1 = ave(var_1, id, FUN = f),
  v2 = ave(var_2, id, FUN = f)
)
such that
id index var_1 var_2 v1 v2
1 1 1 0 1 0.0000000 1.0000000
2 1 2 0 0 0.0000000 0.5000000
3 1 3 1 1 0.3333333 0.6666667
4 1 4 0 1 0.3333333 0.6666667
5 2 1 1 0 1.0000000 0.0000000
6 2 2 1 1 1.0000000 0.5000000
7 2 3 0 1 0.6666667 0.6666667
8 3 1 1 0 1.0000000 0.0000000
9 4 1 0 0 0.0000000 0.0000000
10 4 2 0 0 0.0000000 0.0000000
11 5 1 0 0 0.0000000 0.0000000
12 5 2 1 0 0.5000000 0.0000000
13 5 3 0 1 0.3333333 0.3333333
14 5 4 0 0 0.3333333 0.3333333
15 5 5 0 1 0.0000000 0.6666667
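To see why the embed version works: the vector is left-padded with n - 1 NAs, embed() then stacks the trailing windows as matrix rows, and rowMeans(..., na.rm = TRUE) handles the partial windows automatically. For id = 1 (var_1 = 0, 0, 1, 0):
embed(c(NA, NA, 0, 0, 1, 0), 3)
#>      [,1] [,2] [,3]
#> [1,]    0   NA   NA
#> [2,]    0    0   NA
#> [3,]    1    0    0
#> [4,]    0    1    0
rowMeans(embed(c(NA, NA, 0, 0, 1, 0), 3), na.rm = TRUE)
#> [1] 0.0000000 0.0000000 0.3333333 0.3333333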
You could create a function that accomplishes this:
library(tidyverse)
fun <- function(x, k){
  # partial means over the first k-1 values, then full k-wide rolling means
  y <- cummean(head(x, k - 1))
  if (k > length(x)) y else c(y, zoo::rollmean(x, k))
}
df %>%
  group_by(id) %>%
  mutate(v_1 = fun(var_1, 3), v_2 = fun(var_2, 3))
# Groups: id [5]
id index var_1 var_2 v_1 v_2
<int> <int> <int> <int> <dbl> <dbl>
1 1 1 0 1 0 1
2 1 2 0 0 0 0.5
3 1 3 1 1 0.333 0.667
4 1 4 0 1 0.333 0.667
5 2 1 1 0 1 0
6 2 2 1 1 1 0.5
7 2 3 0 1 0.667 0.667
8 3 1 1 0 1 0
9 4 1 0 0 0 0
10 4 2 0 0 0 0
11 5 1 0 0 0 0
12 5 2 1 0 0.5 0
13 5 3 0 1 0.333 0.333
14 5 4 0 0 0.333 0.333
15 5 5 0 1 0 0.667
Here is a solution using only built-in functions and dplyr:
my_data %>%
  mutate(
    row = seq_along(id),
    v_1 = (var_1 + lag(var_1, default = 0) + lag(var_1, 2, default = 0))/pmin(row, 3),
    v_2 = (var_2 + lag(var_2, default = 0) + lag(var_2, 2, default = 0))/pmin(row, 3)
  )
#> id index var_1 var_2 row v_1 v_2
#> 1 1 1 0 1 1 0.0000000 1.0000000
#> 2 1 2 1 0 2 0.5000000 0.5000000
#> 3 1 3 1 1 3 0.6666667 0.6666667
#> 4 1 4 1 0 4 1.0000000 0.3333333
#> 5 2 1 0 1 5 0.6666667 0.6666667
#> 6 2 2 0 1 6 0.3333333 0.6666667
#> 7 2 3 1 1 7 0.3333333 1.0000000
#> 8 3 1 1 1 8 0.6666667 1.0000000
#> 9 4 1 1 1 9 1.0000000 1.0000000
#> 10 4 2 1 1 10 1.0000000 1.0000000
#> 11 5 1 0 1 11 0.6666667 1.0000000
#> 12 5 2 0 1 12 0.3333333 1.0000000
#> 13 5 3 0 0 13 0.0000000 0.6666667
#> 14 5 4 0 0 14 0.0000000 0.3333333
#> 15 5 5 1 0 15 0.3333333 0.0000000
Created on 2022-06-09 by the reprex package (v2.0.1)
dplyr::lag() gives you the previous values of your variable. If they don't exist, we swap them for 0, basically ignoring them. To get the average we divide by pmin(row, 3), which will be 1 for the first row, 2 for the second, and 3 for all other rows.
This will also work on a grouped dataframe.
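As a sketch of the grouped case (assuming row_number() takes the place of the global seq_along()):
my_data %>%
  group_by(id) %>%
  mutate(
    row = row_number(),
    v_1 = (var_1 + lag(var_1, default = 0) + lag(var_1, 2, default = 0))/pmin(row, 3),
    v_2 = (var_2 + lag(var_2, default = 0) + lag(var_2, 2, default = 0))/pmin(row, 3)
  ) %>%
  ungroup()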
You can use stats::filter (which is masked when loading dplyr) or convolve, with ave for the grouping.
fun <- function(x) {
  # full 3-wide trailing means; only defined from the 3rd element on
  . <- if(length(x) > 2) stats::filter(x, c(1,1,1)/3, sides = 1)[-2:-1] else NULL
  #. <- if(length(x) > 2) convolve(x, c(1,1,1)/3, type = "filter") else NULL #Alternative
  # prepend the partial means for the first one or two elements
  c(cummean(x[1:min(2, length(x))]), .)
}
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun)
my_data
# id index var_1 var_2 v_1 v_2
#1 1 1 1 1 1.0000000 1.0000000
#2 1 2 1 1 1.0000000 1.0000000
#3 1 3 0 1 0.6666667 1.0000000
#4 1 4 1 1 0.6666667 1.0000000
#5 2 1 0 1 0.0000000 1.0000000
#6 2 2 0 0 0.0000000 0.5000000
#7 2 3 1 0 0.3333333 0.3333333
#8 3 1 0 0 0.0000000 0.0000000
#9 4 1 0 1 0.0000000 1.0000000
#10 4 2 0 0 0.0000000 0.5000000
#11 5 1 1 0 1.0000000 0.0000000
#12 5 2 0 1 0.5000000 0.5000000
#13 5 3 0 0 0.3333333 0.3333333
#14 5 4 1 0 0.3333333 0.3333333
#15 5 5 0 1 0.3333333 0.3333333
Or using cumsum:
fun2 <- function(x, n = 3) {
  # running sum minus the running sum lagged by n = trailing n-window sum
  (cumsum(x) - head(cumsum(c(rep(0, n), x)), -n)) / pmin(n, seq_along(x))
}
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun2)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun2)
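The trick here is that head(cumsum(c(rep(0, n), x)), -n) is the running total lagged by n positions, so subtracting it from cumsum(x) leaves the trailing n-window sums. A small check:
x <- c(0, 1, 0, 0, 1)
n <- 3
cumsum(x)
#> [1] 0 1 1 1 2
head(cumsum(c(rep(0, n), x)), -n)
#> [1] 0 0 0 0 1
# difference: 0 1 1 1 1, the trailing 3-value sums, which pmin(n, seq_along(x)) then turns into means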
Here is a try with a simple function avg that returns this type of average.
library(dplyr , warn.conflicts = FALSE)
set.seed(1978)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE), var_2 = sample(c(0,1), 15, replace = TRUE))
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
#===================================
avg <- function(x){
  # mask of window positions: three TRUEs followed by NAs
  t <- rep(c(TRUE, NA), c(3, length(x) - 1))
  m <- numeric(length(x))
  for(i in 1:length(x)){
    # t[3:length(t)] aligns an (up to) 3-wide window ending at position i
    m[i] <- mean(x[t[3:length(t)]], na.rm = TRUE)
    t <- lag(t)  # dplyr::lag shifts the mask one step right
  }
  m
}
#===================================
library(tidyverse)
my_data %>%
  group_by(id) %>%
  mutate(v_1 = avg(var_1), v_2 = avg(var_2))
#> # A tibble: 15 × 6
#> # Groups: id [5]
#> id index var_1 var_2 v_1 v_2
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0 0
#> 2 1 2 1 0 0.5 0
#> 3 1 3 1 0 0.667 0
#> 4 1 4 1 1 1 0.333
#> 5 2 1 0 1 0 1
#> 6 2 2 1 1 0.5 1
#> 7 2 3 0 0 0.333 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 1 1 1 1
#> 10 4 2 0 1 0.5 1
#> 11 5 1 1 1 1 1
#> 12 5 2 1 0 1 0.5
#> 13 5 3 0 1 0.667 0.667
#> 14 5 4 1 0 0.667 0.333
#> 15 5 5 1 0 0.667 0.333
Created on 2022-06-09 by the reprex package (v2.0.1)
This uses dplyr's across with slider's slide_dbl, both from the tidyverse. slider handles partial windows, so it is well suited to this problem.
(%>% may be used instead of the native pipe |>.)
library(dplyr)
library(slider)
# Sample Data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
# Possible answer
df |>
  group_by(id) |>
  mutate(across(starts_with("var_"), ~ slide_dbl(., mean, .before = 2), .names = "{.col}_mean")) |>
  ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 var_1_mean var_2_mean
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-12 by the reprex package (v2.0.1)
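As a quick standalone illustration of the partial windows (.complete = FALSE is the default, so the window simply grows over the first two positions):
library(slider)
slide_dbl(1:5, mean, .before = 2)
#> [1] 1.0 1.5 2.0 3.0 4.0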
I have a table such as
Group Family Nb
1 A 15
2 B 20
3 A 2
3 B 1
3 C 1
4 D 10
4 A 5
5 B 1
5 D 1
And I would like to transform that dataframe so that each unique Family element becomes a column and, for each Group, the cells hold the relative frequency of Nb. I should then get:
Group A B C D E F
1 1 0 0 0 0 0
2 0 1 0 0 0 0
3 0.5 0.25 0.25 0 0 0
4 0.33 0 0 0.67 0 0
5 0 0.5 0 0.5 0 0
Here is the dput of the table if it helps:
df <- structure(list(Group = c(1L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L),
Family = c("A", "B", "A", "B", "C", "D", "A", "B", "D"),
Nb = c(15L, 20L, 2L, 1L, 1L, 10L, 5L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-9L))
In base R:
prop.table(xtabs(Nb ~ ., df), 1)
# Family
#Group A B C D
# 1 1.0000000 0.0000000 0.0000000 0.0000000
# 2 0.0000000 1.0000000 0.0000000 0.0000000
# 3 0.5000000 0.2500000 0.2500000 0.0000000
# 4 0.3333333 0.0000000 0.0000000 0.6666667
# 5 0.0000000 0.5000000 0.0000000 0.5000000
If you need it as a data.frame, just wrap the results in as.data.frame.matrix
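For example:
as.data.frame.matrix(prop.table(xtabs(Nb ~ ., df), 1))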
You can first group_by the Group column, then calculate the frequency and finally pivot the data into a "wide" format.
library(tidyverse)
df %>%
  group_by(Group) %>%
  mutate(Nb = Nb/sum(Nb)) %>%
  pivot_wider(Group, names_from = "Family", values_from = "Nb", values_fill = 0)
# A tibble: 5 × 5
# Groups: Group [5]
Group A B C D
<int> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0
2 2 0 1 0 0
3 3 0.5 0.25 0.25 0
4 4 0.333 0 0 0.667
5 5 0 0.5 0 0.5
Another possible solution:
library(tidyverse)
df %>%
  pivot_wider(names_from = Family, values_from = Nb, values_fill = 0) %>%
  mutate(aux = rowSums(.[-1]), across(-Group, ~ .x / aux), aux = NULL)
#> # A tibble: 5 × 5
#> Group A B C D
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0
#> 2 2 0 1 0 0
#> 3 3 0.5 0.25 0.25 0
#> 4 4 0.333 0 0 0.667
#> 5 5 0 0.5 0 0.5
Consider the following example:
t1 t2 t3 t4 t5 t6
A 4 6 7 8 5
A 3 6 8 1 4
A 2 3 5 3 1
A 3 4 3 1 3
B 3 6 8 3 4
B 3 9 3 7 3
B 5 2 3 2 1
I hope to get a dataframe which checks for differences between every pair of rows possible within a group (A and B separately).
I have been trying to subset the dataframe based on a loop and manually calculate the differences between rows. This is leading to a lot of computation and it is hard to manage the referencing after all the differences have been calculated. I am basically ending up with several lists.
The resulting dataframe needs to contain the row differences for all possible row combinations(or permutations?) within a group.
For example, for B, without considering the first column i.e. characters, the following is the expected result:
t2 t3 t4 t5 t6
0 -3 5 -4 1
-2 4 5 1 3
-2 7 0 5 2
The sign is not very important, only the magnitude.
Here's one way to get a table with a row showing the difference between every pair of rows in each group. For example, row 2 of this output shows row 1 of the input minus row 3 of the input.
library(data.table)
setDT(df) # convert to data.table
df[, { pairs <- CJ(row1 = 1:.N, row2 = 1:.N)[row1 != row2]
       data.table(pairs + .I[1] - 1, .SD[pairs[[1]]] - .SD[pairs[[2]]])
     }, by = t1]
# t1 row1 row2 t2 t3 t4 t5 t6
# 1: A 1 2 1 0 -1 7 1
# 2: A 1 3 2 3 2 5 4
# 3: A 1 4 1 2 4 7 2
# 4: A 2 1 -1 0 1 -7 -1
# 5: A 2 3 1 3 3 -2 3
# 6: A 2 4 0 2 5 0 1
# 7: A 3 1 -2 -3 -2 -5 -4
# 8: A 3 2 -1 -3 -3 2 -3
# 9: A 3 4 -1 -1 2 2 -2
# 10: A 4 1 -1 -2 -4 -7 -2
# 11: A 4 2 0 -2 -5 0 -1
# 12: A 4 3 1 1 -2 -2 2
# 13: B 5 6 0 -3 5 -4 1
# 14: B 5 7 -2 4 5 1 3
# 15: B 6 5 0 3 -5 4 -1
# 16: B 6 7 -2 7 0 5 2
# 17: B 7 5 2 -4 -5 -1 -3
# 18: B 7 6 2 -7 0 -5 -2
This is a little redundant since it also shows row 3 - row 1 (which is just the negative). If you don't want this duplication, change row1 != row2 to row1 < row2.
df[, { pairs <- CJ(row1 = 1:.N, row2 = 1:.N)[row1 < row2]
       data.table(pairs + .I[1] - 1, .SD[pairs[[1]]] - .SD[pairs[[2]]])
     }, by = t1]
# t1 row1 row2 t2 t3 t4 t5 t6
# 1: A 1 2 1 0 -1 7 1
# 2: A 1 3 2 3 2 5 4
# 3: A 1 4 1 2 4 7 2
# 4: A 2 3 1 3 3 -2 3
# 5: A 2 4 0 2 5 0 1
# 6: A 3 4 -1 -1 2 2 -2
# 7: B 5 6 0 -3 5 -4 1
# 8: B 5 7 -2 4 5 1 3
# 9: B 6 7 -2 7 0 5 2
Explanation:
CJ(a, b) generates a data.table with a row for all possible pairs of values (a[i], b[j]). Example:
CJ(1:3, 1:3)
# V1 V2
# 1: 1 1
# 2: 1 2
# 3: 1 3
# 4: 2 1
# 5: 2 2
# 6: 2 3
# 7: 3 1
# 8: 3 2
# 9: 3 3
Since it is a data.table, you can subset by using non-$-prefixed column names in [], for example:
CJ(a = 1:3, b = 1:3)[a < b]
# a b
# 1: 1 2
# 2: 1 3
# 3: 2 3
Within the j part of dt[i, j, k], the variable .SD is the entire data.table subset to the given group (groups determined by the grouping vars in k). So this answer takes the first element of each pair, selects the rows of the group corresponding to those elements .SD[pairs[[1]]], and subtracts from that the rows of the group corresponding to the other element of each pair .SD[pairs[[2]]]. A data.table is created with pairs and the output of this subtraction. This is done for each group, and data.table automatically rbinds all the group outputs together.
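A minimal toy illustration of .SD (hypothetical data, not the input above):
library(data.table)
dt <- data.table(g = c("a", "a", "b"), v = 1:3)
dt[, .SD[1], by = g]  # first row of each group
#>    g v
#> 1: a 1
#> 2: b 3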
Using tidyverse, you can use a Cartesian join within each t1:
dat %>%
  gather(key, value, -grp) %>%
  left_join(gather(dat, key, value, -grp), by = "grp") %>%
  mutate(diff = value.x - value.y)
grp key.x value.x key.y value.y diff
1 A t2 4 t2 4 0
2 A t2 4 t2 3 1
3 A t2 4 t2 2 2
4 A t2 4 t2 3 1
5 A t2 4 t3 6 -2
6 A t2 4 t3 6 -2
Note that since my brain doesn't always comply I relabeled t1 as grp.
Here is an option. We split the dataframe by group, then we find every combo of rows per group, then we map out the differences and rejoin.
library(tidyverse)
testDat %>%
  group_by(t1) %>%
  mutate(row = row_number()) %>%
  split(.$t1) %>%
  map(
    ~nest(., data = -c(t1, row)) %>%
      list(., .) %>%
      reduce(full_join, by = "t1") %>%
      rename(row1 = row.x, row2 = row.y, vec1 = data.x, vec2 = data.y) %>%
      filter(row1 != row2) %>%
      mutate(diff = map2(vec1, vec2, ~unlist(.x) - unlist(.y))) %>%
      select(-vec1, -vec2) %>%
      unnest_wider(col = diff)
  ) %>%
  bind_rows()
#> # A tibble: 18 x 8
#> # Groups: t1 [2]
#> t1 row1 row2 t2 t3 t4 t5 t6
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 2 1 0 -1 7 1
#> 2 A 1 3 2 3 2 5 4
#> 3 A 1 4 1 2 4 7 2
#> 4 A 2 1 -1 0 1 -7 -1
#> 5 A 2 3 1 3 3 -2 3
#> 6 A 2 4 0 2 5 0 1
#> 7 A 3 1 -2 -3 -2 -5 -4
#> 8 A 3 2 -1 -3 -3 2 -3
#> 9 A 3 4 -1 -1 2 2 -2
#> 10 A 4 1 -1 -2 -4 -7 -2
#> 11 A 4 2 0 -2 -5 0 -1
#> 12 A 4 3 1 1 -2 -2 2
#> 13 B 1 2 0 -3 5 -4 1
#> 14 B 1 3 -2 4 5 1 3
#> 15 B 2 1 0 3 -5 4 -1
#> 16 B 2 3 -2 7 0 5 2
#> 17 B 3 1 2 -4 -5 -1 -3
#> 18 B 3 2 2 -7 0 -5 -2
Another option is making use of outer
f1 <- function(dat) {
  # list-matrix of (row i - row j) data frames for every ordered pair
  m1 <- outer(seq_len(nrow(dat)), seq_len(nrow(dat)),
              FUN = Vectorize(function(i, j) list(dat[i, ] - dat[j, ])))
  # drop the diagonal (i == j) and bind the rest
  do.call(rbind, m1[row(m1) != col(m1)])
}
do.call(rbind, lapply(split(df1[-1], df1$t1), f1))
Or using the function with tidyverse
library(dplyr)
library(purrr)
library(tidyr)  # for unnest()
df1 %>%
  group_split(t1) %>%
  map_dfr(~ .x %>%
            summarise(t1 = first(t1), out = list(f1(.[-1])))) %>%
  unnest(out)
data
df1 <- structure(list(t1 = c("A", "A", "A", "A", "B", "B", "B"), t2 = c(4L,
3L, 2L, 3L, 3L, 3L, 5L), t3 = c(6L, 6L, 3L, 4L, 6L, 9L, 2L),
t4 = c(7L, 8L, 5L, 3L, 8L, 3L, 3L), t5 = c(8L, 1L, 3L, 1L,
3L, 7L, 2L), t6 = c(5L, 4L, 1L, 3L, 4L, 3L, 1L)),
class = "data.frame", row.names = c(NA,
-7L))
Here is a base R solution, where combn() is used:
dfout <- lapply(split(df, df$t1),
                function(x) do.call(rbind, combn(seq(nrow(x)), 2,
                                                 function(v) x[v[1], -1] - x[v[2], -1],
                                                 simplify = FALSE)))
such that
> dfout
$A
t2 t3 t4 t5 t6
1 1 0 -1 7 1
2 2 3 2 5 4
3 1 2 4 7 2
21 1 3 3 -2 3
22 0 2 5 0 1
31 -1 -1 2 2 -2
$B
t2 t3 t4 t5 t6
5 0 -3 5 -4 1
51 -2 4 5 1 3
6 -2 7 0 5 2
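Here combn(seq(nrow(x)), 2, ...) enumerates each unordered pair of row indices exactly once, which is why each difference appears a single time:
combn(3, 2)
#>      [,1] [,2] [,3]
#> [1,]    1    1    2
#> [2,]    2    3    3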
DATA
df <- structure(list(t1 = c("A", "A", "A", "A", "B", "B", "B"), t2 = c(4L,
3L, 2L, 3L, 3L, 3L, 5L), t3 = c(6L, 6L, 3L, 4L, 6L, 9L, 2L),
t4 = c(7L, 8L, 5L, 3L, 8L, 3L, 3L), t5 = c(8L, 1L, 3L, 1L,
3L, 7L, 2L), t6 = c(5L, 4L, 1L, 3L, 4L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-7L))
I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1 = 0:2, v2 = 0:2)) %>%
  arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criterion ...
Sorry, I am not very familiar with tibble, so I provide a solution with a data.frame in base R:
shifter <- function(x, n) if (n == 0) x else c(tail(x, -n), head(x, n))
res <- `rownames<-`(
  Reduce(rbind, lapply(seq(length(dfs <- split(df, rep(0:2, 3)))),
                       function(k) {
                         # rotate v1 by k-1 positions to build a repetition-free v2
                         dfs[[k]][, 2] <- shifter(dfs[[k]][, 1], k - 1)
                         dfs[[k]]
                       })),
  seq(nrow(df)))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations in the desired format is given below:
genAllCombn <- function(n) {
  v1 <- rep(0:(n-1), n)
  # offset each block of n rows by 0, 1, ..., n-1 (mod n)
  v2 <- (v1 + rep(0:(n-1), each = n)) %% n
  return(data.frame(v1, v2))
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2
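This works because adding a fixed offset to 0:(n-1) modulo n is still a permutation, so no index repeats within a block; e.g. with n = 4 and offset 2:
(0:3 + 2) %% 4
#> [1] 2 3 0 1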
I need every four rows to be sorted by the 4th column, separately from the next four rows, so I wrote a loop:
for (i in seq(1, nrow(data_frame), by = 4)) {
  data_frame[i:(i+3), 4] <- sort(data_frame[i:(i+3), 4], decreasing = TRUE)
}
The problem is that only the 4th column gets sorted but the corresponding rows are maintained.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to:
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1
With tidyverse, we can use %/% to create a grouping column and use that to sort the 'userID':
library(tidyverse)
df1 %>%
  group_by(grp = (row_number()-1) %/% 4 + 1) %>%
  #or use
  #group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
  mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
              FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
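In both versions the grouping index is just integer division of the zero-based row number:
(seq_len(8) - 1) %/% 4 + 1
#> [1] 1 1 1 1 2 2 2 2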
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split the data every 4 rows, order each chunk by the fourth column and return the updated dataframe.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
                              function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
A tidyverse approach using the same logic would be:
library(tidyverse)
df %>%
  group_split(gl(n()/4, 4), keep = FALSE) %>%
  map_dfr(. %>% arrange(desc(userID)))
I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!
Using dplyr:
library(dplyr)
dat %>%
  group_by(GROUP_ID) %>%
  mutate(CONSECUTIVE = {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])})
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
dat[, CONSECUTIVE := {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])},
    by = GROUP_ID][]
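In both versions rle() does the real work, collapsing VALUE into run lengths and run values; max() is then taken over the lengths that belong to runs of 1:
rle(c(0, 1, 1, 1, 1, 0))
#> Run Length Encoding
#>   lengths: int [1:3] 1 4 1
#>   values : num [1:3] 0 1 0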
We can use ave with rle to get the maximum run of consecutive 1's for each group (GROUP_ID).
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
  y <- rle(x == 1)
  max(y$lengths[y$values])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1
Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
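To unpack the rleid() expression: multiplying the run id by VALUE zeroes out the runs of 0, and na_if() turns those zeros into NA so that table() counts only the runs of 1:
v <- c(0, 1, 1, 1, 1, 0)
rleid(v) * v
#> [1] 0 2 2 2 2 0
table(na_if(rleid(v) * v, 0))
#> 2
#> 4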
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))