I have the following data (a matrix with thousands of columns):
df <- structure(c(1, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 2),
                .Dim = 4:3, .Dimnames = list(c("a", "b", "c", "d"),
                                             c("t1", "t2", "t3")))
What would be an efficient way to get the average of every two rows?
Result I want:
t1 t2 t3
a 1 2 3
b 2 2 3
a_b 1.5 2 3
c 2 2 2
d 1 1 2
c_d 1.5 1.5 2
Split on every 2 rows, get the mean per column, rbind it to the pair, and rbind everything back together again.
do.call(rbind,
        lapply(seq(1, nrow(df), 2), function(i){
          x <- df[i:(i + 1), , drop = FALSE]
          res <- rbind(x, colSums(x)/2)
          rownames(res)[nrow(res)] <- paste(rownames(x), collapse = "_")
          res
        }))
# t1 t2 t3
# a 1.0 2.0 3
# b 2.0 2.0 3
# a_b 1.5 2.0 3
# c 2.0 2.0 2
# d 1.0 1.0 2
# c_d 1.5 1.5 2
One dplyr possibility could be:
df %>%
  data.frame() %>%
  rownames_to_column() %>%
  mutate_if(is.factor, as.numeric) %>%
  group_by(group = gl(n()/2, 2)) %>%
  group_map(~ bind_rows(.x, tibble(rowname = paste(.x$rowname, collapse = "_"),
                                   t1 = mean(.x$t1),
                                   t2 = mean(.x$t2),
                                   t3 = mean(.x$t3)))) %>%
  ungroup() %>%
  select(-group)
rowname t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a 1 2 2
2 b 2 2 2
3 a_b 1.5 2 2
4 c 2 2 1
5 d 1 1 1
6 c_d 1.5 1.5 1
The first three lines of the pipeline could be omitted if you create the data beforehand as a data.frame, with the names as a column and the factors as numeric variables. What it then does is, first, create a grouping variable using gl(). Second, it calculates the means, creates the name as a combination of the two elements in the group, and binds it with the original data. Finally, it ungroups and removes the redundant variable.
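For many columns, spelling out t1, t2, t3 by hand does not scale. Here is a hedged sketch of a more general variant (my own addition, not the original answer; it assumes dplyr >= 1.0 for across(), where() and group_modify()):
library(dplyr)
library(tibble)
df %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  group_by(group = gl(n() / 2, 2)) %>%
  group_modify(~ bind_rows(
    .x,
    summarise(.x,
              rowname = paste(rowname, collapse = "_"),
              across(where(is.numeric), mean))   # mean of every numeric column in the pair
  )) %>%
  ungroup() %>%
  select(-group)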
A base R solution that works with any number of columns:
M <- matrix(unlist(c(df)), ncol = 2, byrow = TRUE)
M <- cbind(M, rowMeans(M))
M <- matrix(c(t(M)),ncol = ncol(df), byrow = FALSE)
# add row names and column names
row.names <- matrix(rownames(df), ncol = 2, byrow = TRUE)
rownames(M) <- c(t(cbind(row.names, apply(row.names,1, paste, collapse = "_"))))
colnames(M) <- colnames(df)
# t1 t2 t3
# a 1.0 2.0 3
# b 2.0 2.0 3
# a_b 1.5 2.0 3
# c 2.0 2.0 2
# d 1.0 1.0 2
# c_d 1.5 1.5 2
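A hedged aside (my addition, not part of the answer above): if only the pair means are needed, base rowsum() gets there in a few lines:
grp <- rep(seq_len(nrow(df) / 2), each = 2)           # 1 1 2 2: one index per row pair
pair_means <- rowsum(df, grp) / 2                     # column sums per pair, divided by 2
rownames(pair_means) <- tapply(rownames(df), grp, paste, collapse = "_")
pair_means
# a_b: 1.5 2.0 3 ; c_d: 1.5 1.5 2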
Another dplyr approach.
Update: If you really need the row names (a, b, a_b, etc.), see below my original solution for a scalable, but more convoluted, version.
Original
df <- df %>% as_tibble()
n <- nrow(df)/2
orig <- df %>% mutate(grp = rep(1:n, each = 2))
means <- orig %>% group_by(grp) %>% summarise_all(mean)
bind_rows(orig, means) %>% arrange(grp) %>% select(-grp)
Output:
# A tibble: 6 x 3
t1 t2 t3
<dbl> <dbl> <dbl>
1 1 2 3
2 2 2 3
3 1.5 2 3
4 2 2 2
5 1 1 2
6 1.5 1.5 2
Updated with row names
rnames <- row.names(df)
df <- df %>% as_tibble()
n <- (nrow(df)/2)
orig <- df %>%
mutate(grp = rep(1:n, each = 2), rn = rnames)
means <- orig %>%
group_by(grp) %>%
mutate(rn = paste0(rn, collapse="_")) %>%
ungroup() %>%
group_by(rn) %>%
summarise_if(is.numeric, mean)
bind_rows(orig, means) %>% arrange(grp) %>% select(-grp)
Output:
t1 t2 t3 rn
<dbl> <dbl> <dbl> <chr>
1 1 2 3 a
2 2 2 3 b
3 1.5 2 3 a_b
4 2 2 2 c
5 1 1 2 d
6 1.5 1.5 2 c_d
One possibility is to use the dplyr package.
Note that the data I use is slightly different from the data you are using: in your data the numbers are actually character values.
df <- structure(c(1, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 2),
.Dim = 4:3, .Dimnames = list(c("a", "b", "c", "d"),
c("t1", "t2", "t3")))
First I create the summary tibble (which contains the means).
library(dplyr)
df_summary <- df %>%
  as_tibble(rownames = "names") %>%
  group_by(ceiling(1:n() / 2)) %>%
  summarise(names = paste(names, collapse = "_"),
            t1 = mean(t1),
            t2 = mean(t2),
            t3 = mean(t3)) %>%
  select(-1)
# A tibble: 2 x 4
names t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a_b 1.5 2 3
2 c_d 1.5 1.5 2
Then I combine the summary data with the original data:
df_summary %>% bind_rows(df %>% as_tibble(rownames = "names")) %>%
slice(3, 4, 1, 5, 6, 2)
# A tibble: 6 x 4
names t1 t2 t3
<chr> <dbl> <dbl> <dbl>
1 a 1 2 3
2 b 2 2 3
3 a_b 1.5 2 3
4 c 2 2 2
5 d 1 1 2
6 c_d 1.5 1.5 2
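With many row pairs, the hard-coded slice() above does not scale. A hedged sketch (my own addition) that carries a pair index instead and arranges on it:
orig  <- df %>% as_tibble(rownames = "names") %>%
  mutate(pair = ceiling(row_number() / 2), is_mean = FALSE)
means <- df_summary %>% mutate(pair = row_number(), is_mean = TRUE)
bind_rows(orig, means) %>%
  arrange(pair, is_mean) %>%          # originals first, then the pair mean
  select(-pair, -is_mean)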
This function averages rows based on a column named "group", which should be present in the dataset.
x is a data frame or a matrix.
rowm <- function(x){
  x <- as.data.frame(x)
  u <- unique(x$group)
  tempDF <- matrix(rep(NA, ncol(x) * length(u)), ncol = ncol(x))
  counter <- 0
  for(i in u){
    counter <- counter + 1
    tempDF[counter, ] <- colMeans(x[x$group == i, ])
  }
  colnames(tempDF) <- colnames(x)
  return(tempDF)
}
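A hypothetical usage example (my addition): pair up consecutive rows of the matrix from the question by adding a group column.
m <- cbind(df, group = rep(1:2, each = 2))   # df is the 4 x 3 matrix from the question
rowm(m)
# returns one row of column means per group:
# group 1 (a, b): t1 = 1.5, t2 = 2.0, t3 = 3
# group 2 (c, d): t1 = 1.5, t2 = 1.5, t3 = 2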
Related
I want to select one row for each group created by variable a. It should be the row with the highest value of variable c, but if variable b is 1 (TRUE) for any row in the group, then the row with b = 1 and the maximum c within that subset should be selected.
I have the following code:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df %>% group_by(a) %>% filter(b == 1) %>%
  arrange(desc(c), .by_group = T) %>%
  summarise_all(function(x) x[1]) -> df1

df %>% group_by(a) %>% filter(all(b != 1)) %>%
  arrange(desc(c), .by_group = T) %>%
  summarise_all(function(x) x[1]) -> df2
df3 <- rbind(df1, df2)
This works, but I wonder if there is a simpler way to achieve the same.
You could filter the rows per group and then do your summarize:
df %>%
  group_by(a) %>%
  filter(all(b == 0) | b == 1) %>%
  summarize(b = first(b), c = max(c))
# a b c
# <int> <dbl> <int>
# 1 1 0 8
# 2 2 1 5
# 3 3 1 9
So we only keep the rows per group where b == 1, or all of them if every b == 0.
We can do it with ifelse inside summarise and without the need to filter b values.
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
cc <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
cc = cc)
df |>
  group_by(a) |>
  # compute teste before redefining b, so that b inside ifelse() still refers to the original column
  summarise(teste = ifelse(any(b == 1), max(cc[b == 1]), max(cc)),
            b = max(b))
Also, never name something c in R.
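A small illustration of why (my aside, not the answerer's):
c <- sample(1:9, size = 9, replace = FALSE)  # a data vector now shares its name with base::c
c(1, 2)    # still calls the combine function: R skips non-function objects when resolving a call
mean(c)    # but any non-call use of the name now picks up your vector instead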
library(data.table)
setDT(df)
# select the maximum c value, grouped by a and b
# then negative order by b (so rows with b == 1 get on top),
# and select the first row of each a-group
df[df[, .I[c == max(c)], by = .(a,b)]$V1][order(a,-b), .SD[1], by = a]
library(dplyr)
df %>% group_by(a) %>%
arrange(desc(b),desc(c), .by_group = T) %>%
slice_head(n = 1) %>%
ungroup()
#> # A tibble: 3 × 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 0 8
#> 2 2 1 5
#> 3 3 1 9
Input data:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df
#> a b c
#> 1 1 0 8
#> 2 1 0 7
#> 3 1 0 4
#> 4 2 0 1
#> 5 2 1 5
#> 6 2 1 2
#> 7 3 1 9
#> 8 3 1 3
#> 9 3 0 6
Created on 2023-01-30 with reprex v2.0.2
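For completeness, a hedged variant of the same idea (my addition, not one of the posted answers): sort so the preferred row comes first within each group of a, then keep one row per group.
library(dplyr)
df %>%
  arrange(desc(b), desc(c)) %>%        # rows with b == 1 and the largest c float to the top
  distinct(a, .keep_all = TRUE) %>%    # first, i.e. preferred, row per value of a
  arrange(a)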
I need a function f(B,A) that, given a dataset with the following structure,
T1 T2 T3 T4 T5 ... P1 P2 P3 P4 P5 ...
1 2 5 8 9 ... A C B B A ...
1 3 4 6 6 ... C A C A B ...
finds the first time B and A appear in Pj columns (starting with j=1) and returns the value difference in the corresponding Ti columns.
For instance:
in line 1: B appears in P3 first, A appears in P1 first. Then:
f(B, A) = T3 - T1 = 5-1 = 4
in line 2: B appears in P5 first, A appears in P2 first. Then:
f(B, A) = T5 - T2 = 6-3 = 3
I can find in which Pj columns B and A appear using str_detect(), but I don't know how to "move" from P_j1, P_j2 to T_j1, T_j2.
Using data.table syntax (or base R) would be appreciated.
Here is a data.table approach.
library(data.table)
DT <- fread("T1 T2 T3 T4 T5 P1 P2 P3 P4 P5
1 2 5 8 9 A C B B A
1 3 4 6 6 C A C A B")
# Add row ID's
DT[, id := .I]
# melt to a long format
DT.melt <- data.table::melt(DT,
                            id.vars = "id",
                            measure.vars = patterns(T = "^T", P = "^P"))
# Find first B for each id
val1 <- DT.melt[P == "B", T[1], by = .(id)]$V1
# [1] 5 6
# Find first A for each id
val2 <- DT.melt[P == "A", T[1], by = .(id)]$V1
# [1] 1 3
val1 - val2
# [1] 4 3
base R
f <- function(l1, l2){
  apply(df, 1, function(x){
    dfP <- x[grepl("P", names(x))]
    dfT <- x[grepl("T", names(x))]
    as.numeric(dfT[which(dfP == l1)[1]]) - as.numeric(dfT[which(dfP == l2)[1]])
  })
}
f("B", "A")
[1] 4 3
Tidyverse
With this type of data, it's usually best to pivot to long and then back to wide: here is a tidyverse solution, with diff being the desired output.
library(tidyverse)
df %>%
  mutate(id = row_number()) %>%
  pivot_longer(-id, names_pattern = "(\\D)(\\d)",
               names_to = c(".value", "group")) %>%
  group_by(id) %>%
  mutate(diff = first(T[P == "B"]) - first(T[P == "A"])) %>%
  pivot_wider(c(id, diff), names_from = group, values_from = c(T, P), names_sep = "")
output
id diff T1 T2 T3 T4 T5 P1 P2 P3 P4 P5
<int> <int> <int> <int> <int> <int> <int> <chr> <chr> <chr> <chr> <chr>
1 1 4 1 2 5 8 9 A C B B A
2 2 3 1 3 4 6 6 C A C A B
I am trying to create a new column that will contain the result of calculations done row-wise over a subset of columns of a tibble, and to add this new column to the existing tibble. Like so:
df <- tibble(
ID = c("one", "two", "three"),
A1 = c(1, 1, 1),
A2 = c(2, 2, 2),
A3 = c(3, 3, 3)
)
I effectively want to do a dplyr equivalent of this code from base R:
df$SumA <- rowSums(df[,grepl("^A", colnames(df))])
My problem is that this doesn't work:
df %>%
select(starts_with("A")) %>%
mutate(SumA = rowSums(.))
# some code here
...because I got rid of the "ID" column in order to let mutate run the rowSums over the other (numerical) columns. I have tried to cbind or bind_cols in the pipe after the mutate, but it doesn't work. None of the variants of mutate work, because they work in-place (within each cell of the tibble, and not across the columns, even with rowwise).
This does work, but doesn't strike me as an elegant solution:
df %>%
mutate(SumA = rowSums(.[,grepl("^A", colnames(df))]))
Is there any tidyverse-based solution that does not require grepl or square brackets but only more standard dplyr verbs and parameters?
My expected output is this:
df_out <- tibble(
ID = c("one", "two", "three"),
A1 = c(1, 1, 1),
A2 = c(2, 2, 2),
A3 = c(3, 3, 3),
SumA = c(6, 6, 6)
)
Here's one way to approach row-wise computation in the tidyverse using purrr::pmap. This is best used with functions that actually need to be run row by row; simple addition could probably be done a faster way. Basically we use select to provide the input list to pmap, which lets us use the select helpers such as starts_with or matches if you need regex.
library(tidyverse)
df <- tibble(
ID = c("one", "two", "three"),
A1 = c(1, 1, 1),
A2 = c(2, 2, 2),
A3 = c(3, 3, 3)
)
df %>%
  mutate(
    SumA = pmap_dbl(
      .l = select(., starts_with("A")),
      .f = function(...) sum(...)
    )
  )
#> # A tibble: 3 x 5
#> ID A1 A2 A3 SumA
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 one 1 2 3 6
#> 2 two 1 2 3 6
#> 3 three 1 2 3 6
Created on 2019-01-30 by the reprex package (v0.2.1)
Here's a different approach that doesn't work row by row but instead exploits the vectorised nature of addition and the fact that addition commutes. That lets us repeatedly apply + with purrr::reduce:
library(tidyverse)
df <- tibble(
ID = c("one", "two", "three"),
A1 = c(1, 1, 1),
A2 = c(2, 2, 2),
A3 = c(3, 3, 3)
)
df %>%
  mutate(
    SumA = reduce(
      .x = select(., starts_with("A")),
      .f = `+`
    )
  )
#> # A tibble: 3 x 5
#> ID A1 A2 A3 SumA
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 one 1 2 3 6
#> 2 two 1 2 3 6
#> 3 three 1 2 3 6
Created on 2019-01-30 by the reprex package (v0.2.1)
1) To do it with rowSums try nesting a second pipeline in the mutate like this:
library(dplyr)
df %>% mutate(Sum = select(., starts_with("A")) %>% rowSums)
giving:
# A tibble: 3 x 5
ID A1 A2 A3 Sum
<chr> <dbl> <dbl> <dbl> <dbl>
1 one 1 2 3 6
2 two 1 2 3 6
3 three 1 2 3 6
2) An alternative is to reshape it to long form and then summarize:
library(dplyr)
library(purrr)
library(tidyr)
df %>%
mutate(Sum = gather(., key, value, -ID) %>%
group_by(., ID) %>%
summarize(sum = sum(value)) %>%
ungroup %>%
pull(sum))
giving:
# A tibble: 3 x 5
ID A1 A2 A3 Sum
<chr> <dbl> <dbl> <dbl> <dbl>
1 one 1 2 3 6
2 two 1 2 3 6
3 three 1 2 3 6
[upd] I didn't notice that @Calum used nearly the same approach.
Another possible way to do that:
library(dplyr)
library(purrr)
dat %>%
mutate(SumA = pmap_dbl(select(., contains('A')), sum))
Data:
# dat <- tibble(
# ID = c("one", "two", "three"),
# A1 = c(1, 1, 1),
# A2 = c(2, 2, 2),
# A3 = c(3, 3, 3)
# )
Output:
# # A tibble: 3 x 5
# ID A1 A2 A3 SumA
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one 1 2 3 6
# 2 two 1 2 3 6
# 3 three 1 2 3 6
You could nest and use rowSums on the nested columns :
library(tidyverse)
df %>% nest(-ID) %>%
mutate(SumA = map_dbl(data,rowSums)) %>%
unnest
# # A tibble: 3 x 5
# ID SumA A1 A2 A3
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one 6 1 2 3
# 2 two 6 1 2 3
# 3 three 6 1 2 3
Or this variant on the pmap approach:
df %>% mutate(SumA = pmap_dbl(.[-1],sum))
# # A tibble: 3 x 5
# ID A1 A2 A3 SumA
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one 1 2 3 6
# 2 two 1 2 3 6
# 3 three 1 2 3 6
And to show that base is sometimes easier:
df$SumA <- rowSums(df[-1])
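A hedged postscript (my addition; the answers above predate dplyr 1.0): newer dplyr offers across() and c_across(), which keep ID in place without grepl or square brackets.
library(dplyr)
df %>% mutate(SumA = rowSums(across(starts_with("A"))))
# or, for arbitrary row-wise functions:
df %>% rowwise() %>% mutate(SumA = sum(c_across(starts_with("A")))) %>% ungroup()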
This is my reproducible code:
df <- data.frame(x = c(1, 2), y = c(3, 4))
df1 <- df %>% mutate(z = 1)
df2 <- df %>% mutate(z = 2)
df3 <- df %>% mutate(z = 3)
df <- rbind(df1, df2, df3)
df
I repeat the original data frame df 3 times, whilst adding one column where the number in the column indicates the repetition. In my use case, I have to do this more than 3 times. I could use a loop, but is there a neater way? I guess I cannot use expand.grid.
You can also do it with a merge:
dfz <- data.frame(z = 1:3)
merge(df, dfz)
# x y z
# 1 1 3 1
# 2 2 4 1
# 3 1 3 2
# 4 2 4 2
# 5 1 3 3
# 6 2 4 3
We can create a list column and unnest
library(tidyverse)
df %>%
mutate(z = list(1:3)) %>%
unnest %>%
arrange(z)
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
We can also do a cross join with sqldf. This creates a Cartesian product of the df and reps tables:
library(sqldf)
reps <- data.frame(z = 1:3)
sqldf("select * from df, reps order by z")
or simply with map_dfr from purrr:
library(purrr)
map_dfr(1:3, ~cbind(df, z = .))
Output:
x y z
1 1 3 1
2 2 4 1
3 1 3 2
4 2 4 2
5 1 3 3
6 2 4 3
Yet another option using base R
n <- 3
do.call(rbind,
        Map(`[<-`,
            replicate(n = n, expr = df, simplify = FALSE),
            "z",
            value = seq_len(n)))
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
A few other ways not covered yet:
# setup
df = data.frame(x = c(1, 2), y = c(3, 4))
n = 3
# simple row indexing, add column manually
result = df[rep(1:nrow(df), n), ]
result$id = rep(1:n, each = nrow(df))
# cross join in base
merge(df, data.frame(id = 1:n), by = NULL)
# cross join in tidyr
tidyr::crossing(df, data.frame(id = 1:n))
# dplyr version of the row-index method above
slice(df, rep(1:n(), n)) %>% mutate(id = rep(1:n, each = nrow(df)))
Inspiration drawn heavily from an old question of mine, How can I repeat a data frame?. Basically the same question but without the id column requirement.
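A hedged addition of my own (assuming dplyr >= 1.1.0, which is newer than these answers): an explicit cross join now exists and reproduces the merge() result.
library(dplyr)
cross_join(df, data.frame(z = 1:3)) %>% arrange(z)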
I have trouble repeating rows of my real data using dplyr. There is already another post here, repeat-rows-of-a-data-frame, but no solution for dplyr.
Here I just wonder what the dplyr solution could be. My attempt below failed with an error:
Error: wrong result size (16), expected 4 or 1
library(dplyr)
df <- data.frame(column = letters[1:4])
df_rep <- df %>%
  mutate(column = rep(column, each = 4))
Expected output
>df_rep
column
#a
#a
#a
#a
#b
#b
#b
#b
#*
#*
#*
Using the uncount function will solve this problem as well. The column count indicates how often a row should be repeated.
library(tidyverse)
df <- tibble(letters = letters[1:4])
df
# A tibble: 4 x 1
letters
<chr>
1 a
2 b
3 c
4 d
df %>%
mutate(count = c(2, 3, 2, 4)) %>%
uncount(count)
# A tibble: 11 x 1
letters
<chr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
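A hedged aside (my addition): uncount() can also keep the count column, or add a within-group index, via its .remove and .id arguments:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  uncount(count, .remove = FALSE, .id = "rep")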
I was looking for a similar (but slightly different) solution. Posting here in case it's useful to anyone else.
In my case, I needed a more general solution that allows each letter to be repeated an arbitrary number of times. Here's what I came up with:
library(tidyverse)
df <- data.frame(letters = letters[1:4])
df
> df
letters
1 a
2 b
3 c
4 d
Let's say I want 2 A's, 3 B's, 2 C's and 4 D's:
df %>%
mutate(count = c(2, 3, 2, 4)) %>%
group_by(letters) %>%
expand(count = seq(1:count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <int>
1 a 1
2 a 2
3 b 1
4 b 2
5 b 3
6 c 1
7 c 2
8 d 1
9 d 2
10 d 3
11 d 4
If you don't want to keep the count column:
df %>%
mutate(count = c(2, 3, 2, 4)) %>%
group_by(letters) %>%
expand(count = seq(1:count)) %>%
select(letters)
# A tibble: 11 x 1
# Groups: letters [4]
letters
<fctr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
If you want the count to reflect the number of times each letter is repeated:
df %>%
mutate(count = c(2, 3, 2, 4)) %>%
group_by(letters) %>%
expand(count = seq(1:count)) %>%
mutate(count = max(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <dbl>
1 a 2
2 a 2
3 b 3
4 b 3
5 b 3
6 c 2
7 c 2
8 d 4
9 d 4
10 d 4
11 d 4
This is rife with peril if the data.frame has other columns (there, I said it!), but the do block will allow you to generate a derived data.frame within a dplyr pipe (though, ceci n'est pas un pipe):
library(dplyr)
df <- data.frame(column = letters[1:4], stringsAsFactors = FALSE)
df %>%
do( data.frame(column = rep(.$column, each = 4), stringsAsFactors = FALSE) )
# column
# 1 a
# 2 a
# 3 a
# 4 a
# 5 b
# 6 b
# 7 b
# 8 b
# 9 c
# 10 c
# 11 c
# 12 c
# 13 d
# 14 d
# 15 d
# 16 d
As @Frank suggested, a much better alternative could be
df %>% slice(rep(1:n(), each=4))
I did a quick benchmark to show that uncount() is a lot faster than expand()
# for the pipe
library(magrittr)
# create some test data
df_test <-
  tibble::tibble(
    letter = letters,
    row_count = sample(1:10, size = 26, replace = TRUE)
  )

# benchmark
bench <- microbenchmark::microbenchmark(
  expand = df_test %>%
    dplyr::group_by(letter) %>%
    tidyr::expand(row_count = seq(1:row_count)),
  uncount = df_test %>%
    tidyr::uncount(row_count)
)
# plot the benchmark
ggplot2::autoplot(bench)