I am scoring a 23-item survey where each item has multiple answer options (select all that apply, not one choice per item), and I am trying to find the minimum, maximum and average value for each item. I have written code to do this (below), but I am wondering if there is a more efficient way than cutting and pasting the three lines that create the min, max and avg columns for every item.
Here is a reproducible example with 2 questions and 3 answer options per question:
#Establish dataframe
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
#Rescore the 0/1 indicators to option values (A = 1, B = 2, C = 3); unselected options become NA
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
This is the code to calculate the values (note that Option A, B and C are the answer choices, while _1 denotes item 1):
##Calculate Values
dt$it1_min <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, min, na.rm=T)
dt$it1_max <- apply(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], 1, max, na.rm=T)
dt$it1_avg <- rowMeans(dt[ ,c("OptA_1", "OptB_1", "OptC_1")], na.rm=T)
I am wondering if I need to do the above for every single item, or if it is possible to write a function so that I can score all 23 items more efficiently.
###potentially repetitive code?
dt$it2_min <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, min, na.rm=T)
dt$it2_max <- apply(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], 1, max, na.rm=T)
dt$it2_avg <- rowMeans(dt[ ,c("OptA_2", "OptB_2", "OptC_2")], na.rm=T)
Here is what the eventual scoring will look like:
##Eventual scoring
dt$tot_min <- rowSums(dt[ ,c("it1_min", "it2_min")], na.rm=T)
dt$tot_max <- rowSums(dt[ ,c("it1_max", "it2_max")], na.rm=T)
dt$tot_avg <- rowSums(dt[ ,c("it1_avg", "it2_avg")], na.rm=T)
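The three-lines-per-item pattern can also be generated programmatically rather than pasted; a minimal base R sketch, assuming every item i has exactly the three columns OptA_i, OptB_i and OptC_i:
#loop over item numbers and build the min/max/avg columns
for (i in 1:2) { # use 1:23 for the full survey
  cols <- paste0(c("OptA_", "OptB_", "OptC_"), i)
  dt[[paste0("it", i, "_min")]] <- apply(dt[, cols], 1, min, na.rm = TRUE)
  dt[[paste0("it", i, "_max")]] <- apply(dt[, cols], 1, max, na.rm = TRUE)
  dt[[paste0("it", i, "_avg")]] <- rowMeans(dt[, cols], na.rm = TRUE)
}
Rows where no option was selected give Inf/-Inf warnings from min/max, exactly as in the per-item code above.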
You will need to convert the data to long form first (tidyr::pivot_longer).
library(dplyr)
library(tidyr)
dt_long <- dt %>%
  # add identifier for participant
  mutate(participant = row_number()) %>%
  # convert to long form using the name pattern
  pivot_longer(cols = -participant,
               names_pattern = "Opt(.*)_(\\d+)",
               names_to = c("answer_choice", "item"),
               values_to = "selected")
dt_long
# long form data looks like this
# A tibble: 30 x 4
#   participant answer_choice item  selected
#         <int> <chr>         <chr>    <dbl>
# 1           1 A             1           NA
# 2           1 B             1            2
# 3           1 C             1           NA
# 4           1 A             2            1
# 5           1 B             2            2
# 6           1 C             2            3
# now group by each participant and item and compute the required fields
dt_long %>%
  group_by(item, participant) %>%
  summarise(it_min = min(selected, na.rm = TRUE),
            it_max = max(selected, na.rm = TRUE),
            it_avg = mean(selected, na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Groups:   item [2]
#>    item  participant it_min it_max it_avg
#>    <chr>       <int>  <dbl>  <dbl>  <dbl>
#>  1 1               1      2      2    2
#>  2 1               2      2      2    2
#>  3 1               3      1      3    2
#>  4 1               4      2      3    2.5
#>  5 1               5      3      3    3
#>  6 2               1      1      1    1
#>  7 2               2      1      3    2
#>  8 2               3      1      3    2
#>  9 2               4      1      3    2
#> 10 2               5      1      2    1.5
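If you need these summaries back in wide form, one row per participant, for the eventual scoring, you can pivot the result back out. A sketch (dt_scores is an assumed name; names_glue reproduces the it1_min-style column names):
dt_scores <- dt_long %>%
  group_by(participant, item) %>%
  summarise(min = min(selected, na.rm = TRUE),
            max = max(selected, na.rm = TRUE),
            avg = mean(selected, na.rm = TRUE),
            .groups = "drop") %>%
  pivot_wider(names_from = item, values_from = c(min, max, avg),
              names_glue = "it{item}_{.value}")
# eventual totals: one rowSums() per statistic
dt_scores %>%
  mutate(tot_min = rowSums(across(matches("^it\\d+_min$")), na.rm = TRUE),
         tot_max = rowSums(across(matches("^it\\d+_max$")), na.rm = TRUE),
         tot_avg = rowSums(across(matches("^it\\d+_avg$")), na.rm = TRUE))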
You can use data.table to melt your dt to long format, compute your indicators by group, and then dcast back to wide format:
library(data.table)
dt = melt(setDT(dt)[, row := .I], id.vars = "row")[, c("variable", "grp") := tstrsplit(variable, "_")][]
dcast(dt[, .(it_min = min(value, na.rm = TRUE),
             it_max = max(value, na.rm = TRUE),
             it_avg = mean(value, na.rm = TRUE)),
         by = .(row, grp)],
      row ~ grp,
      value.var = c("it_min", "it_max", "it_avg"))
Output (note that you used sample() above without setting a seed; see my reproducible data below):
     row it_min_1 it_min_2 it_max_1 it_max_2 it_avg_1 it_avg_2
   <int>    <num>    <num>    <num>    <num>    <num>    <num>
1:     1        2       NA        3       NA      2.5      NaN
2:     2        2        1        3        3      2.5        2
3:     3        2        3        3        3      2.5        3
4:     4        1       NA        1       NA      1.0      NaN
5:     5        3        3        3        3      3.0        3
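The OP's eventual totals then take one rowSums() per statistic. A sketch, assuming the dcast() result above was saved as res:
res[, tot_min := rowSums(.SD, na.rm = TRUE), .SDcols = patterns("^it_min_")]
res[, tot_max := rowSums(.SD, na.rm = TRUE), .SDcols = patterns("^it_max_")]
res[, tot_avg := rowSums(.SD, na.rm = TRUE), .SDcols = patterns("^it_avg_")]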
Input Data:
set.seed(123)
dt <- data.frame(matrix(sample(0:1,30,replace=TRUE), ncol = 6))
colnames(dt) <- c("OptA_1", "OptB_1", "OptC_1", "OptA_2", "OptB_2", "OptC_2")
dt[ ,grepl("OptB_", colnames(dt))] <- ifelse(dt[ ,grepl("OptB_", colnames(dt))]==1, 2, NA)
dt[ ,grepl("OptC_", colnames(dt))] <- ifelse(dt[ ,grepl("OptC_", colnames(dt))]==1, 3, NA)
dt[ ,grepl("OptA_", colnames(dt))] <- ifelse(dt[ ,grepl("OptA_", colnames(dt))]==1, 1, NA)
I have a data frame like this
x1<- c(0,1,1,1,1,0)
df<-data.frame(x1)
I want to add another column that takes the sum of every two rows and stores that value in both of those rows. The result should look like this:
  x1 x2
1  0  1
2  1  1
3  1  2
4  1  2
5  1  1
6  0  1
You can see that the first two rows' sum is 1, and that is given in the first two rows of the new column (x2). Next, the third and fourth rows' sum is given in the third and fourth rows of the new column. Can anyone help?
You can define the groups using floor division and then simply obtain the grouped sum:
library(dplyr)
df %>%
  mutate(group = (row_number() - 1) %/% 2) %>%
  group_by(group) %>%
  mutate(x2 = sum(x1)) %>%
  ungroup() %>%
  select(-group)
# # A tibble: 6 × 2
# x1 x2
# <dbl> <dbl>
# 1 0 1
# 2 1 1
# 3 1 2
# 4 1 2
# 5 1 1
# 6 0 1
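The floor-division index itself is easy to inspect; every consecutive pair of rows shares one group value:
(seq_len(6) - 1) %/% 2
#> [1] 0 0 1 1 2 2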
Here is a way using dplyr where I create an auxiliary column to group by:
library(dplyr)
x1 <- c(0,1,1,1,1,0)
df <- data.frame(x1)
len_df <- nrow(df)
aux <- rep(seq_len(len_df / 2), each = 2)[1:len_df]
df %>%
  mutate(aux = aux) %>%
  group_by(aux) %>%
  mutate(x2 = sum(x1)) %>%
  ungroup() %>%
  select(-aux)
# A tibble: 6 x 2
x1 x2
<dbl> <dbl>
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
Create an index with gl for every 2 rows and do the sum after grouping
library(dplyr)
df <- df %>%
  group_by(grp = as.integer(gl(n(), 2, n()))) %>%
  mutate(x2 = sum(x1)) %>%
  ungroup %>%
  select(-grp)
Output:
df
# A tibble: 6 × 2
x1 x2
<dbl> <dbl>
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
Or using collapse/data.table
library(data.table)
library(collapse)
setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)]
Output:
> df
x1 x2
<num> <num>
1: 0 1
2: 1 1
3: 1 2
4: 1 2
5: 1 1
6: 0 1
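For reference, the gl() index used in the dplyr version above expands to one level per pair of rows:
as.integer(gl(6, 2, 6))
#> [1] 1 1 2 2 3 3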
You can use ave + ceiling (both base R functions); ave defaults to FUN = mean, so multiplying each pair's mean by 2 gives the pair sum:
> transform(df, x2 = ave(x1, ceiling(seq_along(x1) / 2)) * 2)
x1 x2
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
First, a way of making the data.frame without the intermediate variable.
This splits the column into groups of 2, sums each group, then repeats each sum into the new variable:
df <- data.frame(x1 = c(0,1,1,1,1,0))
df$x2 <- rep(sapply(split(df$x1, rep(1:3, each = 2)), sum), each = 2)
# x1 x2
#1 0 1
#2 1 1
#3 1 2
#4 1 2
#5 1 1
#6 0 1
In base R you could do:
transform(df, x2 = ave(x1, gl(nrow(df)/2, 2), FUN = sum))
x1 x2
1 0 1
2 1 1
3 1 2
4 1 2
5 1 1
6 0 1
A few more options, with benchmarks:
x1 <- sample(0:1, 1e4, replace = TRUE)
microbenchmark::microbenchmark(
  matrix = rep(colSums(matrix(x1, 2)), each = 2),
  recycle = x1 + x1[seq(x1) + c(1, -1)],
  cumsum = rep(diff(cumsum(c(0, x1))[seq(1, length(x1) + 1, 2)]), each = 2),
  Thomas = ave(x1, ceiling(seq_along(x1)/2)) * 2,
  onyambu = ave(x1, gl(length(x1)/2, 2), FUN = sum),
  check = "equal"
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> matrix 65.001 69.6510 79.27203 78.4510 82.1510 148.501 100
#> recycle 95.001 100.6505 108.65003 107.5510 110.6010 176.901 100
#> cumsum 137.201 148.9010 169.61090 166.5505 177.7015 340.002 100
#> Thomas 24645.401 25297.2010 26450.46994 25963.3515 27463.2010 31803.101 100
#> onyambu 3774.902 3935.7510 4444.36500 4094.3520 4336.1505 11070.301 100
With data.table for large data:
library(data.table)
library(collapse)
x1 <- sample(0:1, 1e6, replace = TRUE)
df <- data.frame(x1)
microbenchmark::microbenchmark(
  matrix = setDT(df)[, x2 := rep(colSums(matrix(x1, 2)), each = 2)],
  recycle = setDT(df)[, x2 := x1 + x1[.I + c(1, -1)]],
  akrun = setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)],
  check = "identical"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> matrix 8.053302 8.937301 10.64786 9.376551 12.51890 17.2037 100
#> recycle 12.117101 12.965950 16.57696 14.003151 17.09805 56.4729 100
#> akrun 10.071701 10.611051 14.42578 11.291601 14.79090 55.1141 100
I want to replace value(s) with NA by group.
have <- data.frame(id = c(1,1,1,1,2,2,2),
                   value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
                    value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
                    value = c(1,2,NA,NA,5,NA,NA))
want1 corresponds to replacing the last value with NA, and want2 corresponds to replacing both the last and second-to-last values with NA. I'm currently trying to do this with the dplyr package but can't seem to get any traction. Any help would be much appreciated. Thanks!
We can use row_number() to test the current row against n() the total rows in the group.
have |>
  group_by(id) |>
  mutate(
    last1 = ifelse(row_number() == n(), NA, value),
    last2 = ifelse(row_number() >= n() - 1, NA, value)
  )
# # A tibble: 7 × 4
# # Groups: id [2]
# id value last1 last2
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 3 3 NA
# 4 1 4 NA NA
# 5 2 5 5 5
# 6 2 6 6 NA
# 7 2 7 NA NA
And a general way to provide variants as different data frames.
lapply(
  1:2,
  function(k) {
    have %>%
      group_by(id) %>%
      mutate(value = ifelse(row_number() <= (n() - k), value, NA))
  }
)
[[1]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 3
4 1 NA
5 2 5
6 2 6
7 2 NA
[[2]]
# A tibble: 7 × 2
# Groups: id [2]
id value
<dbl> <dbl>
1 1 1
2 1 2
3 1 NA
4 1 NA
5 2 5
6 2 NA
7 2 NA
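If you want the variants named like the targets, a small follow-up (assuming the list above was assigned to res):
names(res) <- c("want1", "want2")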
Here is a base R way.
have <- data.frame(id = c(1,1,1,1,2,2,2),
                   value = c(1,2,3,4,5,6,7))
want1 <- data.frame(id = c(1,1,1,1,2,2,2),
                    value = c(1,2,3,NA,5,6,NA))
want2 <- data.frame(id = c(1,1,1,1,2,2,2),
                    value = c(1,2,NA,NA,5,NA,NA))
with(have, ave(value, id, FUN = \(x){
  x[length(x)] <- NA
  x
}))
#> [1]  1  2  3 NA  5  6 NA
with(have, ave(value, id, FUN = \(x){
  x[length(x)] <- NA
  if (length(x) > 1)
    x[length(x) - 1L] <- NA
  x
}))
#> [1]  1  2 NA NA  5 NA NA
Then reassign these results to column value.
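For example, to build want1 from have:
want1 <- have
want1$value <- with(have, ave(value, id, FUN = \(x){
  x[length(x)] <- NA
  x
}))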
I'm looking to get each unique combination of two variables:
library(purrr)
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`)
# A tibble: 6 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 1 2
4 3 2
5 1 3
6 2 3
How do I remove out the mirrored combinations? That is, I want only one of rows 1 and 3 in the data frame above, only one of rows 2 and 5, and only one of rows 4 and 6. My desired output would be something like:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2
I don't care if a particular id value is in id1 or id2, so the below is just as acceptable as the output:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 1 2
2 1 3
3 2 3
A tidyverse version of Dan's answer:
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`) %>%
  mutate(min = pmap_int(., min), max = pmap_int(., max)) %>% # Find the min and max in each row
  unite(check, c(min, max), remove = FALSE) %>% # Combine them in a "check" variable
  distinct(check, .keep_all = TRUE) %>% # Remove duplicates of the "check" variable
  select(id1, id2)
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2
A Base R approach:
# create a string with the sorted elements of the row
# (the separator avoids collisions such as c(1,12) vs c(11,2))
df$temp <- apply(df, 1, function(x) paste(sort(x), collapse = "-"))
# then you can simply keep rows with a unique sorted-string value
df[!duplicated(df$temp), 1:2]
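As an aside, the same set of unordered pairs can be generated directly with base R's combn(), with no filtering or de-duplication step:
pairs <- as.data.frame(t(combn(seq_len(3), 2)))
names(pairs) <- c("id1", "id2")
pairs
#   id1 id2
# 1   1   2
# 2   1   3
# 3   2   3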
Let's say I have the following simple data.frame:
ID value
1 1 3
2 2 4
3 1 5
4 3 3
My desired output is below, where we add a value to cumsum or we update it according to the latest value of an already used ID.
ID value cumsum
1 1 3 3
2 2 4 7
3 1 5 9
4 3 3 12
In row 3, the new value forms an updated cumsum (7-3+5=9). Row 4 adds a new value to cumsum because the ID was not used before (4+5+3=12).
This produces the desired outcome for your example: within each ID, value is replaced by its change since that ID's previous value, so a plain cumsum over the changes yields the updated running total.
df<-read.table(header=T, text="ID value
1 1 3
2 2 4
3 1 5
4 3 3")
library(tidyverse)
df %>%
  group_by(ID) %>%
  mutate(value = value - lag(value, default = 0L)) %>%
  ungroup() %>%
  mutate(cumsum = cumsum(value))
# # A tibble: 4 x 3
# ID value cumsum
# <int> <int> <int>
# 1 1 3 3
# 2 2 4 7
# 3 1 2 9
# 4 3 3 12
I used data.table for the cumsum. Calculating the cumulative mean is a bit more tricky because the number of observations is not adjusted by just using cummean.
library(data.table)
library(zoo) # na.locf() is used below for last-observation-carried-forward
dt = data.table(id = c(1, 2, 1, 3), value = c(3, 4, 5, 3))
dt[, tmp := value-shift(value, n = 1L, type = "lag", fill = 0), by = c("id")]
#CUMSUM
dt[, cumsum := cumsum(tmp)]
#CUMMEAN WITH UPDATED N
dt[value != tmp, skip := 1:.N]
dt[, skip := na.locf(skip, na.rm = FALSE)]
dt[is.na(skip), skip := 0]
dt[, cummean := cumsum(tmp)/(seq_along(tmp)-skip)]
Output is:
id value tmp cumsum skip cummean
1: 1 3 3 3 0 3.0
2: 2 4 4 7 0 3.5
3: 1 5 2 9 1 4.5
4: 3 3 3 12 1 4.0
Edit: Changed lag function to data.table's shift function.
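If you would rather not depend on zoo, recent versions of data.table can carry the last value forward themselves; a sketch replacing the na.locf() step:
dt[, skip := nafill(skip, type = "locf")]
dt[is.na(skip), skip := 0]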
I have a dataframe (might not be sorted like this) that looks like this:
Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10
And now I want a new column that counts how many rows per Group have a value that falls within a fixed range of each row's value (let's say for this example that it has to be between 2 less than the current row's value and the value itself, inclusive). So the result would be:
Group Value New Count
A     1     1  (because there is only 1 row in Group A between -1 and 1, this row)
A     5     1  (because there is only 1 row in Group A between 3 and 5, this row)
A     6     2  (because there are 2 rows in Group A between 4 and 6) ..and so on
A     11    1
B     3     1
B     4     2
B     5     3
B     10    1
I have seen some answers with respect to running total counters within a group, etc, but I haven't come across this situation in my searching on SO...
Another approach is to use a non-equi join and group on the join conditions:
library(data.table)
delta <- 2 # the fixed range from the question
setDT(DF)[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
                             on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
Group Value New.Count
1: A 1 1
2: A 5 1
3: A 6 2
4: A 11 1
5: B 3 1
6: B 4 2
7: B 5 3
8: B 10 1
Data
library(data.table)
DF <- fread(
" Group Value
A 1
A 5
A 6
A 11
B 3
B 4
B 5
B 10"
)
I found a way using a loop; not sure how to do it otherwise:
Df <- data.frame(Value = c(1, 5, 8, 11, 3, 4, 5, 10),
                 Group = c("A", "A", "A", "A", "B", "B", "B", "B"))
for (i in 1:nrow(Df)) {
  Df$newcount[i] <- sum(Df$Value <= Df$Value[i] &
                        Df$Value >= Df$Value[i] - 2 &
                        Df$Group == Df$Group[i])
}
It loops over each row and counts the rows matching the conditions you described: value between the current value and value - 2, and in the same group.
I was looking for a data.table way but didn't manage it.
The output:
Value Group newcount
1 1 A 1
2 5 A 1
3 8 A 1
4 11 A 1
5 3 B 1
6 4 B 2
7 5 B 3
8 10 B 1
Based on what you started (as mentioned in your comment), here is a loop to do it:
df <- data.frame(Group = c(rep("A", 4), rep("B", 4)),
                 Value = c(1, 5, 6, 11, 3, 4, 5, 10))
require(dplyr)
for (i in seq_along(df$Value)) {
  df$NewCount[i] <- nrow(df %>% filter(Group == Group[i] &
                                         Value <= Value[i] &
                                         Value >= Value[i] - 2))
}
You can achieve this with purrr, but maybe there is a more succinct way. We first build the search range for each value, then count how many of the group's values fall into each range. We can wrap this in a function and re-use it conveniently.
library(tidyverse)
find_counts <- function(x, range = 2) {
  # the window of values to match for each element of x
  search_range <- map(x, ~seq(.x - range, .x, 1))
  # count how many of the group's values fall into each window
  map2_int(list(x), search_range, ~sum(.x %in% .y))
}
Df %>%
  group_by(Group) %>%
  mutate(result = find_counts(Value))
#> # A tibble: 8 x 3
#> # Groups: Group [2]
#> Group Value result
#> <fctr> <int> <dbl>
#> 1 A 1 1
#> 2 A 5 1
#> 3 A 8 1
#> 4 A 11 1
#> 5 B 3 1
#> 6 B 4 2
#> 7 B 5 3
#> 8 B 10 1
Results from microbenchmark::microbenchmark with the following data:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = TRUE),
                 Value = sample(1:10, 500, replace = TRUE))
Unit: milliseconds
expr min lq mean median uq max neval cld
ANG 1607.59370 1645.93364 1776.582546 1709.976584 1822.011283 2603.61574 30 c
ThomasK 15.30110 16.11919 19.040010 17.238959 19.550713 54.30369 30 a
denis 155.92567 165.73500 182.563020 171.147209 204.508171 253.26394 30 b
uwe 2.15669 2.46198 3.207837 2.570449 3.114574 13.28832 30 a
Data
Df <- read.table(text = " Group Value
A 1
A 5
A 8
A 11
B 3
B 4
B 5
B 10", header = T)
Only base R:
count_in_range = function(x){
  delta = 2
  vapply(x,
         FUN = function(value) sum(x >= (value - delta) & x <= value, na.rm = TRUE),
         FUN.VALUE = numeric(1))
}
dfs <- DF # the question's data, as read in with fread() above
dfs$newcount = ave(dfs$Value, dfs$Group, FUN = count_in_range)
dfs
# Group Value newcount
# 1 A 1 1
# 2 A 5 1
# 3 A 6 2
# 4 A 11 1
# 5 B 3 1
# 6 B 4 2
# 7 B 5 3
# 8 B 10 1
Benchmark with data.table:
set.seed(928374)
DF <- data.frame(Group = sample(letters[1:15], 500, replace = TRUE),
                 Value = sample(1:10, 500, replace = TRUE))
library(data.table)
library(microbenchmark)
DT = as.data.table(DF)
delta = 2
microbenchmark(
  datatable = {
    DT[, New.Count := .SD[.(Group = Group, V1 = Value, V2 = Value - delta),
                          on = .(Group, Value <= V1, Value >= V2), .N, by = .EACHI]$N][]
  },
  ave = {
    DF$newcount = ave(DF$Value, DF$Group, FUN = count_in_range)
  }
)
# Unit: microseconds
# expr min lq mean median uq max neval
# datatable 1424.814 1438.3355 1492.9422 1459.2175 1512.100 1914.575 100
# ave 712.708 737.1955 849.0507 756.7265 789.327 3583.369 100
all.equal(DF$newcount, DT$New.Count) # TRUE