My dataset looks like this:
block <- c(1,1,2,2,3,3,4,4)
treatment <- c(1,1,2,2,1,1,2,2)
type <- c("adult1","adult2","adult1","adult2","adult1","adult2","adult1","adult2")
t1 <- c(1,1,2,2,3,3,4,4)
t2 <- c(1,1,2,2,3,3,4,4)
t100 <- c(1,1,2,2,3,3,4,4)
df <- data.frame(block,treatment, type,t1,t2,t100)
I wish to sum the adults for each time point (t1, t2, t100) with respect to the block only. Here is what I want the final output to look like:
block <- c(1,2,3,4)
treatment <- c(1,2,1,2)
type <- c("adult","adult","adult","adult")
t1 <- c(2,4,6,8)
t2 <- c(2,4,6,8)
t100 <- c(2,4,6,8)
df <- data.frame(block,treatment,type,t1,t2,t100)
Here is my attempt using aggregate function:
aggregate(df[,3:5], by = list(df$block), FUN = sum)
I get an error message saying that "arguments must be of the same length".
With aggregate you can use a formula to sum up t1:t100 and group by block and treatment:
df_final = aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, sum)
df_final$type1 = "adult"
Result:
block treatment t1 t2 t100 type1
1 1 1 2 2 2 adult
2 3 1 6 6 6 adult
3 2 2 4 4 4 adult
4 4 2 8 8 8 adult
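If you also want the rows ordered by block, as in your desired output, you can reorder afterwards (a minimal sketch):
# the aggregate result is not necessarily sorted by block, so reorder explicitly
df_final <- df_final[order(df_final$block), ]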
Or you can do this with dplyr:
library(dplyr)
df %>%
group_by(block, treatment) %>%
summarize_at(vars(t1:t100), sum) %>%
mutate(type1 = "adult")
or
df %>%
group_by(block, treatment) %>%
summarize_at(vars(2:4), sum) %>%
mutate(type1 = "adult")
Result:
# A tibble: 4 x 6
# Groups: block [4]
block treatment t1 t2 t100 type1
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 2 2 2 adult
2 2 2 4 4 4 adult
3 3 1 6 6 6 adult
4 4 2 8 8 8 adult
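Note that in recent dplyr versions (1.0+), summarize_at() is superseded by across(); an equivalent sketch:
df %>%
  group_by(block, treatment) %>%
  summarize(across(t1:t100, sum), .groups = "drop") %>%
  mutate(type1 = "adult")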
You can also use data.table, which supports column indexing:
library(data.table)
setDT(df)[, lapply(.SD, sum), by=.(block, treatment), .SDcols=4:6]
Result:
block treatment t1 t2 t100
1: 1 1 2 2 2
2: 2 2 4 4 4
3: 3 1 6 6 6
4: 4 2 8 8 8
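Instead of numeric indices, .SDcols also accepts column names or the patterns() helper (data.table >= 1.12.0), which is more robust if column positions change. A sketch (the regex ^t\d is chosen to match t1, t2, t100 but not treatment or type):
setDT(df)[, lapply(.SD, sum), by = .(block, treatment), .SDcols = patterns("^t\\d")]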
Solution in base R:
df <- cbind.data.frame(
aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, FUN = sum),
type = "adult");
# block treatment t1 t2 t100 type
#1 1 1 2 2 2 adult
#2 3 1 6 6 6 adult
#3 2 2 4 4 4 adult
#4 4 2 8 8 8 adult
Note: The key here is to cbind the relevant columns.
Benchmarking comparison
Below are the results from a microbenchmark of all three solutions (base R, dplyr, data.table).
# Sample dataframe with 2000 rows (1000 blocks, two rows each)
df <- cbind.data.frame(
block = rep(seq(1, 1000), each = 2),
treatment = rep(c(1, 1, 2, 2), length.out = 250),
type = rep(c("adult1", "adult2"), length.out = 500),
t1 = rep(seq(1, 1000), each = 2),
t2 = rep(seq(1, 1000), each = 2),
t100 = rep(seq(1, 1000), each = 2));
# Benchmarking results
require(microbenchmark);
require(dplyr);
require(magrittr);
require(data.table);
microbenchmark(
baseR = cbind.data.frame(
aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, FUN = sum),
type = "adult"),
dplyr = df %>%
group_by(block, treatment) %>%
summarize_at(vars(t1:t100), sum) %>%
mutate(type1 = "adult"),
datatable = setDT(df)[, lapply(.SD, sum), by=.(block, treatment), .SDcols=4:6]
)
#Unit: microseconds
# expr min lq mean median uq max neval
# baseR 13817.627 14040.4835 14931.4202 14278.8220 15026.413 42347.511 100
# dplyr 6698.983 7076.6360 8459.7861 7240.1680 7486.245 73401.747 100
# datatable 463.837 500.6555 663.5425 576.3075 597.443 9015.664 100
Related
I have a data frame like this:
x1<- c(0,1,1,1,1,0)
df<-data.frame(x1)
I want to add another column that takes the sum of every two rows and stores that value in both of those rows. The result should look like this:
  x1 x2
1  0  1
2  1  1
3  1  2
4  1  2
5  1  1
6  0  1
You can see here that the first two rows' sum is 1, and that is given in the first two rows of the new column (x2). Next, the third and fourth rows' sum is given in the third and fourth rows of the new column. Can anyone help?
You can define the groups using floor division and then simply obtain the grouped sum:
library(dplyr)
df %>%
mutate(group = (row_number() - 1) %/% 2) %>%
group_by(group) %>%
mutate(x2 = sum(x1)) %>%
ungroup() %>%
select(-group)
# # A tibble: 6 × 2
# x1 x2
# <dbl> <dbl>
# 1 0 1
# 2 1 1
# 3 1 2
# 4 1 2
# 5 1 1
# 6 0 1
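If you are on dplyr 1.1.0 or later, the per-operation .by argument avoids the group_by()/ungroup() pair; a sketch of the same idea:
library(dplyr)
df %>%
  mutate(group = (row_number() - 1) %/% 2) %>%
  mutate(x2 = sum(x1), .by = group) %>%
  select(-group)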
Here is a way using dplyr where I create an auxiliary column to group by:
library(dplyr)
x1<- c(0,1,1,1,1,0)
df <- data.frame(x1)
len_df <- nrow(df)
aux <- rep(seq_len(len_df / 2), each = 2)[1:len_df]
df %>%
mutate(aux = aux) %>%
group_by(aux) %>%
mutate(x2 = sum(x1)) %>%
ungroup() %>%
select(-aux)
# A tibble: 6 x 2
x1 x2
<dbl> <dbl>
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
Create an index with gl for every 2 rows and do the sum after grouping
library(dplyr)
df <- df %>%
group_by(grp = as.integer(gl(n(), 2, n()))) %>%
mutate(x2 = sum(x1)) %>%
ungroup %>%
select(-grp)
-output
df
# A tibble: 6 × 2
x1 x2
<dbl> <dbl>
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
Or using collapse/data.table
library(data.table)
library(collapse)
setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)]
-output
> df
x1 x2
<num> <num>
1: 0 1
2: 1 1
3: 1 2
4: 1 2
5: 1 1
6: 0 1
You can use ave + ceiling (both are base R functions). ave defaults to FUN = mean, so doubling each pair's mean gives its sum:
> transform(df, x2 = ave(x1, ceiling(seq_along(x1) / 2)) * 2)
x1 x2
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
First, a way of making the data.frame without the intermediate variable.
This splits the data.frame into groups of 2, sums each group, then repeats each sum to fill the new variable:
df <- data.frame(x1 = c(0,1,1,1,1,0))
df$x2 <- rep(sapply(split(df, rep(1:3, each = 2)), sum), each = 2)
# x1 x2
#1 0 1
#2 1 1
#3 1 2
#4 1 2
#5 1 1
#6 0 1
In base R you could do:
transform(df,x2 = ave(x1, gl(nrow(df)/2, 2), FUN = sum))
x1 x2
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
A few more options, with benchmarks.
x1 <- sample(0:1, 1e4, replace = TRUE)
microbenchmark::microbenchmark(
matrix = rep(colSums(matrix(x1, 2)), each = 2),
recycle = x1 + x1[seq(x1) + c(1, -1)],
cumsum = rep(diff(cumsum(c(0, x1))[seq(1, length(x1) + 1, 2)]), each = 2),
Thomas = ave(x1, ceiling(seq_along(x1)/2))*2,
onyambu = ave(x1, gl(length(x1)/2, 2), FUN = sum),
check = "equal"
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> matrix 65.001 69.6510 79.27203 78.4510 82.1510 148.501 100
#> recycle 95.001 100.6505 108.65003 107.5510 110.6010 176.901 100
#> cumsum 137.201 148.9010 169.61090 166.5505 177.7015 340.002 100
#> Thomas 24645.401 25297.2010 26450.46994 25963.3515 27463.2010 31803.101 100
#> onyambu 3774.902 3935.7510 4444.36500 4094.3520 4336.1505 11070.301 100
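To see why the matrix option is fastest: matrix() fills column by column, so a 2-row matrix puts each consecutive pair in its own column, and colSums() then returns one sum per pair. A tiny illustration with the example data:
x1 <- c(0, 1, 1, 1, 1, 0)
matrix(x1, nrow = 2)   # columns are the pairs: (0,1), (1,1), (1,0)
rep(colSums(matrix(x1, nrow = 2)), each = 2)
# [1] 1 1 2 2 1 1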
With data.table for large data:
library(data.table)
library(collapse)
x1 <- sample(0:1, 1e6, replace = TRUE)
df <- data.frame(x1)
microbenchmark::microbenchmark(
matrix = setDT(df)[, x2 := rep(colSums(matrix(x1, 2)), each = 2)],
recycle = setDT(df)[, x2 := x1 + x1[.I + c(1, -1)]],
akrun = setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)],
check = "identical"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> matrix 8.053302 8.937301 10.64786 9.376551 12.51890 17.2037 100
#> recycle 12.117101 12.965950 16.57696 14.003151 17.09805 56.4729 100
#> akrun 10.071701 10.611051 14.42578 11.291601 14.79090 55.1141 100
I am scoring a survey (23 items) with multiple options for each item (it is select all answers that apply, not one choice per item), and trying to find the minimum, maximum and average value for each item. I have written code to do this (below) but am wondering if there is a more efficient way than to cut and paste the three lines creating min, max and avg columns for every item.
Here is a reproducible example with 2 questions and 3 answer options per question:
#Establish dataframe
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
#Rescore incorrect value
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
This is the code to calculate the values (note here Option A, B and C are the answer choices, while the _1 denotes item 1)
##Calculate Values
dt$it1_min <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, min, na.rm=T)
dt$it1_max <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, max, na.rm=T)
dt$it1_avg <- rowMeans(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], na.rm=T)
I am wondering if I need to do the above ^ for every single item, or if it's possible to write a function so that I can score all items (OptA_1 through OptA_23) more efficiently.
###potentially repetitive code?
dt$it2_min <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, min, na.rm=T)
dt$it2_max <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, max, na.rm=T)
dt$it2_avg <- rowMeans(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], na.rm=T)
Here is what the eventual scoring will look like:
##Eventual scoring
dt$tot_min <- rowSums(dt[ ,c("it1_min", "it2_min")], na.rm=T)
dt$tot_max <- rowSums(dt[ ,c("it1_max", "it2_max")], na.rm=T)
dt$tot_avg <- rowSums(dt[ ,c("it1_avg", "it2_avg")], na.rm=T)
You will need to convert the data to long form first (tidyr::pivot_longer).
library(dplyr)
library(tidyr)
dt_long <- dt %>%
# add identifier for participant
mutate(participant = row_number()) %>%
# convert to long form using pattern
pivot_longer(cols = -participant,
names_pattern = "Opt(.*)_(\\d+)",
names_to = c("answer_choice", "item"),
values_to = "selected")
dt_long
# long form data looks like this
# A tibble: 30 x 4
# participant answer_choice item selected
# <int> <chr> <chr> <dbl>
# 1 1 A 1 NA
# 2 1 B 1 2
# 3 1 C 1 NA
# 4 1 A 2 1
# 5 1 B 2 2
# 6 1 C 2 3
# now group by each participant and item and compute the required fields
dt_long %>%
group_by(item, participant) %>%
summarise(it_min = min(selected, na.rm = TRUE),
it_max = max(selected, na.rm = TRUE),
it_avg = mean(selected, na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Groups: item [2]
#> item participant it_min it_max it_avg
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 1 1 2 2 2
#> 2 1 2 2 2 2
#> 3 1 3 1 3 2
#> 4 1 4 2 3 2.5
#> 5 1 5 3 3 3
#> 6 2 1 1 1 1
#> 7 2 2 1 3 2
#> 8 2 3 1 3 2
#> 9 2 4 1 3 2
#> 10 2 5 1 2 1.5
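If you want the wide per-item layout from your original code (it1_min, it1_max, ...), you can pivot the summary back to wide form. A sketch building on dt_long above; the shorter column names and the names_glue pattern are my own choices:
library(tidyr)
dt_long %>%
  group_by(item, participant) %>%
  summarise(min = min(selected, na.rm = TRUE),
            max = max(selected, na.rm = TRUE),
            avg = mean(selected, na.rm = TRUE),
            .groups = "drop") %>%
  pivot_wider(names_from = item,
              values_from = c(min, max, avg),
              names_glue = "it{item}_{.value}")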
You can use data.table to melt your dt to long format, estimate your indicators by group, and then dcast back to wide format:
library(data.table)
dt = melt(setDT(dt)[, row:=.I], id.vars="row")[, c("variable","grp") := tstrsplit(variable, "_")][]
dcast(dt[, .(it_min = min(value,na.rm=T),
it_max = max(value,na.rm=T),
it_avg = mean(value, na.rm=T)
), by=.(row,grp)],
row~grp,
value.var=c("it_min", "it_max", "it_avg")
)
Output (note that you used sample() above without setting a seed; see my reproducible input data below):
row it_min_1 it_min_2 it_max_1 it_max_2 it_avg_1 it_avg_2
<int> <num> <num> <num> <num> <num> <num>
1: 1 2 NA 3 NA 2.5 NaN
2: 2 2 1 3 3 2.5 2
3: 3 2 3 3 3 2.5 3
4: 4 1 NA 1 NA 1.0 NaN
5: 5 3 3 3 3 3.0 3
Input Data:
set.seed(123)
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
I want to calculate the median for several columns based on what letters each column begins with, creating a new column.
I know the way to do this for mean using dplyr and an example like that shown below:
library(dplyr)
mutate(df, IVMean = rowMeans(select(df, starts_with("IV")), na.rm = TRUE))
But how is this possible for median?
I have tried a number of ways, including select then mutate, but am having no luck.
We could use rowMedians after converting to a matrix:
library(matrixStats)
library(dplyr)
mutate(df, IVMedian = rowMedians(as.matrix(select(df,
starts_with("IV"))), na.rm = TRUE))
-output
IV1 IV2 IV3 col4 IVMedian
1 2 4 1 1 2
2 3 8 3 2 3
3 4 9 4 3 4
Or this can be done row-wise with c_across:
df %>%
rowwise %>%
mutate(IVMedian = median(c_across(starts_with('IV')))) %>%
ungroup
# A tibble: 3 x 5
IV1 IV2 IV3 col4 IVMedian
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 4 1 1 2
2 3 8 3 2 3
3 4 9 4 3 4
Or use pmap (from purrr)
library(purrr)
df %>%
mutate(IVMedian = pmap_dbl(select(., starts_with('IV')),
~ median(c(...))))
IV1 IV2 IV3 col4 IVMedian
1 2 4 1 1 2
2 3 8 3 2 3
3 4 9 4 3 4
Or in collapse
library(collapse)
df$IVMedian <- dapply(gvr(df, vars = '^IV'), MARGIN = 1, FUN = fmedian)
Data
df <- data.frame(IV1 = c(2, 3, 4), IV2 = c(4, 8, 9),
IV3 = c(1, 3, 4), col4 = c(1, 2, 3))
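For completeness, this also works in base R with apply (no extra packages, though row-wise apply can be slow on large data). A sketch, assuming the df defined just above; the regex ^IV\d avoids matching the new IVMedian column on re-runs:
df$IVMedian <- apply(df[, grepl("^IV\\d", names(df))], 1, median, na.rm = TRUE)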
This is my reproducible code:
df <- data.frame(x = c(1, 2), y = c(3, 4))
df1 <- df %>% mutate(z = 1)
df2 <- df %>% mutate(z = 2)
df3 <- df %>% mutate(z = 3)
df <- rbind(df1, df2, df3)
df
I repeat the original data frame df 3 times, while adding a column whose number indicates the repetition. In my use case, I have to do this more than 3 times. I could use a loop, but is there a neater way? I guess I cannot use expand.grid.
You can also do it with a merge:
dfz <- data.frame(z = 1:3)
merge(df, dfz)
# x y z
# 1 1 3 1
# 2 2 4 1
# 3 1 3 2
# 4 2 4 2
# 5 1 3 3
# 6 2 4 3
We can create a list column and unnest
library(tidyverse)
df %>%
mutate(z = list(1:3)) %>%
unnest(z) %>%
arrange(z)
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
We can also do a cross join with sqldf. This creates the Cartesian product of the df and reps tables:
library(sqldf)
reps <- data.frame(z = 1:3)
sqldf("select * from df, reps order by z")
Or simply with map_dfr from purrr:
library(purrr)
map_dfr(1:3, ~cbind(df, z = .))
Output:
x y z
1 1 3 1
2 2 4 1
3 1 3 2
4 2 4 2
5 1 3 3
6 2 4 3
Yet another option using base R
n <- 3
# make n copies of df, assign each copy a column z = 1..n, then row-bind
do.call(rbind,
        Map(`[<-`, replicate(n = n, expr = df, simplify = FALSE),
            "z", value = seq_len(n)))
# x y z
#1 1 3 1
#2 2 4 1
#3 1 3 2
#4 2 4 2
#5 1 3 3
#6 2 4 3
A few other ways not covered yet:
# setup
df = data.frame(x = c(1, 2), y = c(3, 4))
n = 3
# simple row indexing, add column manually
result = df[rep(1:nrow(df), n), ]
result$id = rep(1:n, each = nrow(df))
# cross join in base
merge(df, data.frame(id = 1:n), by = NULL)
# cross join in tidyr
tidyr::crossing(df, data.frame(id = 1:n))
# dplyr version of the row-index method above
slice(df, rep(1:n(), n)) %>% mutate(id = rep(1:n, each = nrow(df)))
Inspiration drawn heavily from an old question of mine, How can I repeat a data frame?. Basically the same question but without the id column requirement.
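If you are on dplyr 1.1.0 or later, there is also a dedicated cross-join verb; a sketch:
library(dplyr)
# pairs every row of df with every value of z
cross_join(df, data.frame(z = 1:3))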
I have a dataframe (might not be sorted like this) that looks like this:
Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10
And now I want a new column that counts, per Group, how many rows have a value within a fixed range of the current row's value (for this example, between 2 less than the current row's value and the value itself, inclusive). So the result would be:
Group Value New Count
A 1 1 (because there is only 1 row in Group A between -1 and 1, this row)
A 5 1 (because there is only 1 row in Group A between 3 and 5, this row)
A 6 2 (because there are 2 rows in Group A between 4 and 6)..and so on
A 11 1
B 3 1
B 4 2
B 5 3
B 10 1
I have seen some answers with respect to running total counters within a group, etc, but I haven't come across this situation in my searching on SO...
Another approach is to use a non-equi join and group on the join conditions (delta is the width of the window below each Value):
library(data.table)
delta <- 2
setDT(DF)[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
    on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
Group Value New.Count
1: A 1 1
2: A 5 1
3: A 6 2
4: A 11 1
5: B 3 1
6: B 4 2
7: B 5 3
8: B 10 1
Data
library(data.table)
DF <- fread(
" Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10"
)
I found a way using a loop; not sure how to do it otherwise:
Df <- data.frame(list(Value = c(1,5,8,11,3,4,5,10), Group = c("A","A","A","A","B","B","B","B")))
for (i in 1:nrow(Df)) {
  Df$newcount[i] <- sum(Df$Value <= Df$Value[i] & Df$Value >= Df$Value[i] - 2 & Df$Group == Df$Group[i])
}
It loops over each row and counts the rows meeting the conditions you described: Value between the current Value and Value - 2, and in the same Group.
I was looking for a data.table way but didn't manage it.
The output:
Value Group newcount
1 1 A 1
2 5 A 1
3 8 A 1
4 11 A 1
5 3 B 1
6 4 B 2
7 5 B 3
8 10 B 1
Based on what you started (as mentioned in your comment), here is a loop to do it:
df <- data.frame(Group = c(rep("A", 4), rep("B", 4)),
Value = c(1, 5, 6, 11, 3, 4, 5, 10))
require(dplyr)
for(i in seq_along(df$Value)){
df$NewCount[i] <- nrow(df %>% filter(Group == Group[i] &
Value <= Value[i] &
Value >= Value[i]-2))
}
You can achieve this with purrr, though there may be a more succinct way. We first create, for each row, the range of values to search. Next we take the group's values, and for the result we count how many of them fall into each row's search range. We can wrap this in a function and re-use it conveniently:
library(tidyverse)
find_counts <- function(x, range = 2) {
search_range <- map(x, ~seq(.x-range, .x, 1))
unique_vals <- list(x)
map2_int(unique_vals, search_range, ~sum(.x %in% .y))
}
Df %>%
group_by(Group) %>%
mutate(result = find_counts(Value))
#> # A tibble: 8 x 3
#> # Groups: Group [2]
#> Group Value result
#> <fctr> <int> <dbl>
#> 1 A 1 1
#> 2 A 5 1
#> 3 A 8 1
#> 4 A 11 1
#> 5 B 3 1
#> 6 B 4 2
#> 7 B 5 3
#> 8 B 10 1
Results from microbenchmark::microbenchmark with the following data:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
Unit: milliseconds
expr min lq mean median uq max neval cld
ANG 1607.59370 1645.93364 1776.582546 1709.976584 1822.011283 2603.61574 30 c
ThomasK 15.30110 16.11919 19.040010 17.238959 19.550713 54.30369 30 a
denis 155.92567 165.73500 182.563020 171.147209 204.508171 253.26394 30 b
uwe 2.15669 2.46198 3.207837 2.570449 3.114574 13.28832 30 a
Data
Df <- read.table(text = " Group Value
A 1
A 5
A 8
A 11
B 3
B 4
B 5
B 10", header = T)
Only base R:
count_in_range = function(x){
delta = 2
vapply(x,
FUN = function(value) sum(x>=(value - delta) & x<=value, na.rm = TRUE),
FUN.VALUE = numeric(1)
)
}
DF$newcount = ave(DF$Value, DF$Group, FUN = count_in_range)
DF
# Group Value newcount
# 1 A 1 1
# 2 A 5 1
# 3 A 6 2
# 4 A 11 1
# 5 B 3 1
# 6 B 4 2
# 7 B 5 3
# 8 B 10 1
Benchmark with data.table:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = T),
Value = sample(1:10, 500, replace = T))
library(data.table)
library(microbenchmark)
DT = as.data.table(DF)
delta = 2
microbenchmark(
datatable = {
DT[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
},
ave = {
DF$newcount = ave(DF$Value, DF$Group, FUN = count_in_range)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# datatable 1424.814 1438.3355 1492.9422 1459.2175 1512.100 1914.575 100
# ave 712.708 737.1955 849.0507 756.7265 789.327 3583.369 100
all.equal(DF$newcount, DT$New.Count) # TRUE