repeat dataframe n times whilst adding column - r

This is my reproducible code:
library(dplyr)
df <- data.frame(x = c(1, 2), y = c(3, 4))
df1 <- df %>% mutate(z = 1)
df2 <- df %>% mutate(z = 2)
df3 <- df %>% mutate(z = 3)
df <- rbind(df1, df2, df3)
df
I repeat the original data frame df 3 times, adding one column z whose value indicates the repetition. In my use case I have to do this more than 3 times. I could use a loop, but is there a neater way? I guess I cannot use expand.grid.
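For reference, the loop the question alludes to can be written compactly with lapply; this is only a minimal sketch of that baseline, assuming df is the original two-row data frame and dplyr is loaded:
n <- 3
# build one copy of df per repetition, tagging each copy with its repetition number z
do.call(rbind, lapply(seq_len(n), function(i) mutate(df, z = i)))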

You can also do it with a merge:
dfz <- data.frame(z = 1:3)
merge(df, dfz)
# x y z
# 1 1 3 1
# 2 2 4 1
# 3 1 3 2
# 4 2 4 2
# 5 1 3 3
# 6 2 4 3
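As a side note (not part of the original answer): merge() with no common columns performs a Cartesian product, and dplyr >= 1.1 exposes the same operation as cross_join(), so a roughly equivalent sketch would be:
library(dplyr)
# cross join, then sort so the repetition index varies slowest, matching the merge output
cross_join(df, dfz) %>% arrange(z)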

We can create a list column and unnest
library(tidyverse)
df %>%
  mutate(z = list(1:3)) %>%
  unnest(z) %>%
  arrange(z)
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3

We can also do a cross join with sqldf. This creates a Cartesian product of the df and reps tables:
library(sqldf)
reps <- data.frame(z = 1:3)
sqldf("select * from df, reps order by z")
Or simply with map_dfr from purrr:
library(purrr)
map_dfr(1:3, ~cbind(df, z = .))
Output:
x y z
1 1 3 1
2 2 4 1
3 1 3 2
4 2 4 2
5 1 3 3
6 2 4 3
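In purrr >= 1.0, map_dfr() is superseded; a roughly equivalent sketch (assuming R >= 4.1 for the \(i) lambda syntax) combines map() with list_rbind():
library(purrr)
# map over the repetition indices, then row-bind the resulting list of data frames
list_rbind(map(1:3, \(i) cbind(df, z = i)))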

Yet another option using base R
n <- 3
do.call(rbind,
        Map(`[<-`,
            replicate(n = n, expr = df, simplify = FALSE),
            "z",
            value = seq_len(n)))
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3

A few other ways not covered yet:
# setup
df = data.frame(x = c(1, 2), y = c(3, 4))
n = 3
# simple row indexing, add column manually
result = df[rep(1:nrow(df), n), ]
result$id = rep(1:n, each = nrow(df))
# cross join in base
merge(df, data.frame(id = 1:n), by = NULL)
# cross join in tidyr
tidyr::crossing(df, data.frame(id = 1:n))
# dplyr version of the row-index method above
slice(df, rep(1:n(), n)) %>% mutate(id = rep(1:n, each = nrow(df)))
Inspiration drawn heavily from an old question of mine, "How can I repeat a data frame?", which is basically the same question but without the id column requirement.

Related

Add a named row between two dataframes in R

I'm trying to add a named row between two data.frames d1 and d2. My Desired_output is shown below.
I have tried a solution but it failed to get me to my desired output. Is there a solution to this?
d1 <- data.frame(b = 1:2, SE = 2:3)
d2 <- data.frame(b = 0:1, SE = 1:2)
a <- "obs"
# Solution failed:
dplyr::bind_rows(d1, !!a := rep(NA, ncol(d1)), d2)
Desired_output =
" b SE
1 1 2
2 2 3
obs NA NA
4 0 1
5 1 2"
In base R, we may also do
rbind(d1,
      `row.names<-`(d1[NA, ][1, ], a),
      `row.names<-`(d2, nrow(d1) + seq_len(nrow(d2))))
-output
b SE
1 1 2
2 2 3
obs NA NA
3 0 1
4 1 2
Maybe this will help -
d1 <- data.frame(b = 1:2, SE = 2:3)
d2 <- data.frame(b = 0:1, SE = 1:2)
a <- "obs"
d3 <- d1[1, ]
d3[] <- NA
rownames(d3) <- a
rbind(d1, d3, d2)
# b SE
#1 1 2
#2 2 3
#obs NA NA
#11 0 1
#21 1 2
Here is an alternative solution, in case you want it in a pipe!
We wrap the whole procedure around add_row from the tibble package.
With bind_rows we bind both tables together and add a row before the first row of d2 (position 3).
Then we convert the rownames to a column, rename the inserted row, and convert the column back to rownames.
library(tibble)
library(dplyr)
add_row(bind_rows(d1, d2),
        b = NA,
        SE = NA,
        .before = 3) %>%
  data.frame() %>%
  rownames_to_column("x") %>%
  mutate(x = ifelse(x == "3", "obs", x)) %>%
  column_to_rownames("x")
b SE
1 1 2
2 2 3
obs NA NA
4 0 1
5 1 2

Average values from multiple data frames by position [duplicate]

This question already has an answer here: Average Cells of Two or More DataFrames.
I have two dataframes:
dataA <- data.frame(A = replicate(5, 1), B = replicate(5, 2))
dataB <- data.frame(A = replicate(5, 3), B = replicate(5, 4))
I would like to create a third data frame dataC that is the average of the other two. For example, row 1 column 1 in the third data frame would be the average of the same position in the first two data frames.
Desired output:
dataC <- data.frame(A = replicate(5, 2), B = replicate(5, 3))
dataC
A B
2 3
2 3
2 3
2 3
2 3
We can place the datasets in a list, do elementwise sum with +, and divide by the length of the list:
Reduce(`+`, list(dataA, dataB))/2
-output
A B
1 2 3
2 2 3
3 2 3
4 2 3
5 2 3
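The same Reduce() idea extends to any number of equally shaped data frames once they are collected in a list; a minimal sketch (dfs is just an illustrative name):
dfs <- list(dataA, dataB)   # could hold any number of same-sized numeric frames
Reduce(`+`, dfs) / length(dfs)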
Another option is to bind the datasets while creating a grouping column based on row position, and then take the group-wise mean:
library(dplyr)
library(data.table)
bind_rows(dataA, dataB, .id = 'grp') %>%
  group_by(grp = rowid(grp)) %>%
  summarise(across(everything(), mean)) %>%
  select(-grp)
-output
# A tibble: 5 x 2
A B
<dbl> <dbl>
1 2 3
2 2 3
3 2 3
4 2 3
5 2 3
Here are some solutions:
# method 1:
dataC <- (dataA + dataB) / 2
# method 2:
dataC <- dataA
dataC[] <- Map(function(x,y) (x+y)/2, dataA, dataB)
# A B
# 1 2 3
# 2 2 3
# 3 2 3
# 4 2 3
# 5 2 3

Apply function to a row in a data.frame using dplyr

In base R I would do the following:
d <- data.frame(a = 1:4, b = 4:1, c = 2:5)
apply(d, 1, which.max)
With dplyr I could do the following:
library(dplyr)
d %>% mutate(u = purrr::pmap_int(list(a, b, c), function(...) which.max(c(...))))
If there's another column in d I need to specify it, but I want this to work with an arbitrary number of columns.
Conceptually, I’d like something like
pmap_int(list(everything()), ...)
pmap_int(list(.), ...)
But this obviously does not work. How would I solve that canonically with dplyr?
We just need to pass the data as ., since a data.frame is a list with columns as list elements. If we wrap it in list(.), it becomes a nested list:
library(dplyr)
d %>%
mutate(u = pmap_int(., ~ which.max(c(...))))
# a b c u
#1 1 4 2 2
#2 2 3 3 2
#3 3 2 4 3
#4 4 1 5 3
Or we can use cur_data():
d %>%
mutate(u = pmap_int(cur_data(), ~ which.max(c(...))))
Or, if we want to use everything(), place it inside select(), since list(everything()) doesn't specify the data from which everything should be selected:
d %>%
mutate(u = pmap_int(select(., everything()), ~ which.max(c(...))))
Or using rowwise
d %>%
  rowwise %>%
  mutate(u = which.max(cur_data())) %>%
  ungroup
# A tibble: 4 x 4
# a b c u
# <int> <int> <int> <int>
#1 1 4 2 2
#2 2 3 3 2
#3 3 2 4 3
#4 4 1 5 3
Or, more efficiently, with max.col:
max.col(d, 'first')
#[1] 2 2 3 3
Or with collapse
library(collapse)
dapply(d, which.max, MARGIN = 1)
#[1] 2 2 3 3
which can be included in dplyr as
d %>%
mutate(u = max.col(cur_data(), 'first'))
Here are some data.table options
setDT(d)[, u := which.max(unlist(.SD)), 1:nrow(d)]
or
setDT(d)[, u := max.col(.SD, "first")]

Count number of values in row using dplyr

This question should have a simple, elegant solution but I can't figure it out, so here it goes:
Let's say I have the following dataset and I want to count the number of 2s present in each row using dplyr.
set.seed(1)
ID <- LETTERS[1:5]
X1 <- sample(1:5, 5,T)
X2 <- sample(1:5, 5,T)
X3 <- sample(1:5, 5,T)
df <- data.frame(ID,X1,X2,X3)
library(dplyr)
Now, the following works:
df %>%
  rowwise %>%
  mutate(numtwos = sum(c(X1, X2, X3) == 2))
But how do I avoid typing out all of the column names?
I know this is probably easier to do without dplyr, but more generally I want to know how I can use dplyr's mutate with multiple columns without typing out all the column names.
Try rowSums:
> set.seed(1)
> ID <- LETTERS[1:5]
> X1 <- sample(1:5, 5,T)
> X2 <- sample(1:5, 5,T)
> X3 <- sample(1:5, 5,T)
> df <- data.frame(ID,X1,X2,X3)
> df
ID X1 X2 X3
1 A 2 5 2
2 B 2 5 1
3 C 3 4 4
4 D 5 4 2
5 E 2 1 4
> rowSums(df == 2)
[1] 2 1 0 1 1
Alternatively, with dplyr:
> df %>% mutate(numtwos = rowSums(. == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
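Note that the . pronoun relies on the magrittr pipe; with dplyr >= 1.0 a similar sketch (not from the original answer) restricts the sum to the numeric columns via across():
df %>%
  mutate(numtwos = rowSums(across(where(is.numeric)) == 2))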
Here's another alternative using by_row() (at the time part of purrr; it now lives in the purrrlyr package):
library(purrr)
df %>%
  by_row(function(x) {
    sum(x[-1] == 2)
  },
  .to = "numtwos",
  .collate = "cols")
Which gives:
#Source: local data frame [5 x 5]
#
# ID X1 X2 X3 numtwos
# <fctr> <int> <int> <int> <int>
#1 A 2 5 2 2
#2 B 2 5 1 1
#3 C 3 4 4 0
#4 D 5 4 2 1
#5 E 2 1 4 1
As mentioned in the NEWS, row-based functionals are still maturing in dplyr:
We are still figuring out what belongs in dplyr and what belongs in
purrr. Expect much experimentation and many changes with these
functions.
Benchmark
We can see how rowwise() and do() compare to purrr::by_row() for this type of problem and how they "perform" against rowSums() and the tidy data way:
largedf <- df[rep(seq_len(nrow(df)), 10e3), ]
library(microbenchmark)
microbenchmark(
  steven = largedf %>%
    by_row(function(x) {
      sum(x[-1] == 2)
    },
    .to = "numtwos",
    .collate = "cols"),
  psidom = largedf %>%
    rowwise %>%
    do(data_frame(numtwos = sum(.[-1] == 2))) %>%
    cbind(largedf, .),
  gopala = largedf %>%
    gather(key, value, -ID) %>%
    group_by(ID) %>%
    summarise(numtwos = sum(value == 2)) %>%
    inner_join(largedf, .),
  evan = largedf %>%
    mutate(numtwos = rowSums(. == 2)),
  times = 10L,
  unit = "relative"
)
Results:
#Unit: relative
# expr min lq mean median uq max neval cld
# steven 1225.190659 1261.466936 1267.737126 1227.762573 1276.07977 1339.841636 10 b
# psidom 3677.603240 3759.402212 3726.891458 3678.717170 3728.78828 3777.425492 10 c
# gopala 2.715005 2.684599 2.638425 2.612631 2.59827 2.572972 10 a
# evan 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 10 a
Just to add to @evan.oman's answer, in case you only want to sum over specific columns rather than all of them: you can use the regular select() and/or the select helpers. In this example, we don't want to include X1 in rowSums:
df %>%
mutate(numtwos = rowSums(select(., -X1) == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 1
2 B 2 5 1 0
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 0
One approach is to use a combination of dplyr and tidyr to convert data into long format, and do the computation:
library(dplyr)
library(tidyr)
df %>%
  gather(key, value, -ID) %>%
  group_by(ID) %>%
  summarise(numtwos = sum(value == 2)) %>%
  inner_join(df, .)
Output is as follows:
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
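gather() is superseded in tidyr >= 1.0; a sketch of the same reshape using pivot_longer() (assuming that tidyr version) would be:
df %>%
  pivot_longer(-ID) %>%
  group_by(ID) %>%
  summarise(numtwos = sum(value == 2)) %>%
  inner_join(df, .)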
You can use do, which computes the new column but doesn't attach it to your original data frame:
df %>%
  rowwise %>%
  do(numtwos = sum(.[-1] == 2)) %>%
  data.frame
numtwos
1 2
2 1
3 0
4 1
5 1
Add a cbind to bind the new column to the original data frame:
df %>%
  rowwise %>%
  do(numtwos = sum(.[-1] == 2)) %>%
  data.frame %>% cbind(df, .)
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1

Filling in values in a data frame in R?

Suppose I have this data frame:
times vals
1 1 2
2 3 4
3 7 6
set up with
foo <- data.frame(times=c(1,3,7), vals=c(2,4,6))
and I want this one:
times vals
1 1 2
2 2 2
3 3 4
4 4 4
5 5 4
6 6 4
7 7 6
That is, I want to fill in all the times from 1 to 7, and fill in the vals from the latest time that is not greater than the given time.
I have some code to do it using dplyr, but it is ugly. Suggestions for better?
library(dplyr)
foo <- merge(foo, data.frame(times=1:max(foo$times)), all.y=TRUE)
foo2 <- merge(foo, foo, by=c(), suffixes=c('', '.1'))
foo2 <- foo2 %>%
  filter(is.na(vals) & !is.na(vals.1) & times.1 <= times) %>%
  group_by(times) %>%
  arrange(-times.1) %>%
  mutate(rn = row_number()) %>%
  filter(rn == 1) %>%
  mutate(vals = vals.1,
         rn = NULL,
         vals.1 = NULL,
         times.1 = NULL)
foo <- merge(foo, foo2, by = c('times'), all.x = TRUE, suffixes = c('', '.2'))
foo <- mutate(foo,
              vals = ifelse(is.na(vals), vals.2, vals),
              vals.2 = NULL)
This is a standard rolling join problem:
library(data.table)
setDT(foo)[.(1:7), on = 'times', roll = T]
# times vals
#1: 1 2
#2: 2 2
#3: 3 4
#4: 4 4
#5: 5 4
#6: 6 4
#7: 7 6
The above is for the devel version (1.9.7+), which is smarter about column matching during joins. For 1.9.6 you still need to specify the column name for the inner table:
setDT(foo)[.(times = 1:7), on = 'times', roll = T]
With approx:
data.frame(times = 1:7,
           vals = unlist(approx(foo, xout = 1:7, method = "constant", f = 0)[2],
                         use.names = FALSE))
times vals
1 1 2
2 2 2
3 3 4
4 4 4
5 5 4
6 6 4
7 7 6
A dplyr and tidyr option:
library(dplyr)
library(tidyr)
foo %>%
  right_join(tibble(times = min(foo$times):max(foo$times))) %>%
  fill(vals)
# Joining by: "times"
# times vals
# 1 1 2
# 2 2 2
# 3 3 4
# 4 4 4
# 5 5 4
# 6 6 4
# 7 7 6
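A closely related variant (a sketch, assuming tidyr's complete() and full_seq() are available) generates the missing times and carries the values forward in a single tidyr pipeline:
foo %>%
  complete(times = full_seq(times, period = 1)) %>%   # fill in the missing time points
  fill(vals)                                          # carry the last observed value forward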
Here is a somewhat longer and more verbose base R solution:
# calculate the number of repetitions needed for vals variable
reps <- c(with(foo, times[2:length(times)] - times[1:(length(times) - 1)]), 1)
# get result
fooDoneIt <- data.frame(times = min(foo$times):max(foo$times),
                        vals = rep(foo$vals, reps))
