How do I get accuracy values by group

I can't get the average accuracies (proportion of TRUE values) in the Correct_answer column for the groups defined by chart type and condition.
structure(list(Element = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), Correct_answer = structure(c(2L,
2L, 2L, 1L, 2L), .Label = c("FALSE", "TRUE"), class = "factor"),
Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238
), Chart_type = structure(c(2L, 2L, 1L, 1L, 1L), .Label = c("Box",
"Violin"), class = "factor"), Condition = structure(c(1L,
2L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor")), row.names = c(NA,
5L), class = "data.frame")
Average by chart_type
av_data_chartType <- data %>% group_by(Chart_type) %>% summarise_each(funs(mean, sd))
Average by condition
av_data_conition <- data %>% group_by(Condition) %>% summarise_each(funs(mean, sd))
No mean is produced for accuracy
An NA value is placed where the accuracy should be.

Reproducing your code, I got a warning that led me to the answer: you shouldn't compute statistics on factor variables. If you know what you are doing, you can convert them to numeric:
# Example data from the question, rebuilt from its dput() output:
# five observations with factor columns Element, Correct_answer,
# Chart_type and Condition plus the numeric Response_time.
data <- structure(
  list(
    Element = structure(c(1L, 1L, 1L, 1L, 1L),
                        .Label = c("1", "2", "3", "4", "5", "6"),
                        class = "factor"),
    Correct_answer = structure(c(2L, 2L, 2L, 1L, 2L),
                               .Label = c("FALSE", "TRUE"),
                               class = "factor"),
    Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238),
    Chart_type = structure(c(2L, 2L, 1L, 1L, 1L),
                           .Label = c("Box", "Violin"),
                           class = "factor"),
    Condition = structure(c(1L, 2L, 1L, 2L, 1L),
                          .Label = c("0", "1"),
                          class = "factor")
  ),
  row.names = c(NA, 5L), class = "data.frame"
)
library("dplyr", warn.conflicts = FALSE)
data <- data %>% as_tibble
# av_data_chartType
# Mean and sd of every non-grouping column, per Chart_type.
data %>%
group_by(Chart_type) %>%
# as.numeric() on a factor yields its integer level codes, so the
# FALSE/TRUE levels of Correct_answer become 1/2 and the reported mean is
# the proportion of TRUE plus 1 (1.67 for Box in the output below).
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Chart_type`
#> # A tibble: 2 x 9
#> Chart_type Element_mean Correct_answer_~ Response_time_m~ Condition_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 Box 1 1.67 50.1 1.33
#> 2 Violin 1 2 15.9 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Condition_sd <dbl>
# av_data_condition
# Mean and sd of every non-grouping column, per Condition.
data %>%
group_by(Condition) %>%
# as.numeric() on a factor yields its integer level codes, so the
# FALSE/TRUE levels of Correct_answer become 1/2 and the reported mean is
# the proportion of TRUE plus 1 (1.5 for Condition 1 in the output below).
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Condition`
#> # A tibble: 2 x 9
#> Condition Element_mean Correct_answer_~ Response_time_m~ Chart_type_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 0 1 2 53.1 1.33
#> 2 1 1 1.5 11.4 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Chart_type_sd <dbl>
Created on 2019-06-11 by the reprex package (v0.2.1)

This should work:
# Convert the "FALSE"/"TRUE" factor to logical so that mean() returns the
# proportion of TRUE values.
a$Correct_answer <- as.logical(a$Correct_answer)
# Keep only the grouping column and Correct_answer so the summary
# functions are not applied to the remaining columns.
av_data_chartType <- a %>%
  select(Chart_type, Correct_answer) %>%
  group_by(Chart_type) %>%
  summarise_each(funs(mean, sd))
# Stored under its own name: assigning to av_data_chartType again would
# overwrite the per-chart-type result computed above.
av_data_condition <- a %>%
  select(Condition, Correct_answer) %>%
  group_by(Condition) %>%
  summarise_each(funs(mean, sd))
You had 2 problems:
Your Correct_answer was a factor.
You tried to calculate your functions over every Column

You probably need
# Accuracy for every Chart_type x Condition combination; the factor is
# turned into a logical inside the summary itself.
data %>%
  group_by(Chart_type, Condition) %>%
  summarise(avg = mean(as.logical(Correct_answer)))
Or if you need them separately
# Accuracy per chart type.
data %>%
  group_by(Chart_type) %>%
  summarise(avg = mean(as.logical(Correct_answer)))
# Accuracy per condition.
data %>%
  group_by(Condition) %>%
  summarise(avg = mean(as.logical(Correct_answer)))


How to add column reporting sum of couple of subsequent rows

I have the following dataset
# dput() of the example frequency table (8 rows: Var1 x Var2 x Var3).
structure(list(Var1 = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("0", "1"), class = "factor"), Var2 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("congruent", "incongruent"
), class = "factor"), Var3 = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L), .Label = c("spoken", "written"), class = "factor"),
Freq = c(8L, 2L, 10L, 2L, 10L, 2L, 10L, 2L)), class = "data.frame", row.names = c(NA,
-8L))
I would like to add another column reporting the sum of each pair of consecutive rows. The final result would look like this:
I have proceeded like this
Table =$unimodal,data_1$cong_cond, data_1$presentation_mode)) %>%
mutate(Var1 = factor(Var1, levels = c('0', '1')))
row = Table %>% #is.factor(Table$Var1)
~ .[Var1 == '0'] + .[Var1 == '1'],
.names = "{.col}_sum"))
column = c(rbind(row$Freq_sum,rep(NA, 4)))
Table$column = column
But I am looking for the quickest way possible, without splitting the code into separate steps. Here I have used the dplyr package, but if you can suggest other approaches, e.g. with map(), a for loop, or whichever method you deem best, please let me know.
This should do:
# Total of each consecutive pair of Freq values, placed on the first row
# of the pair with NA on the second row.
df$column <-
  as.vector(rbind(colSums(matrix(df$Freq, nrow = 2)), NA))
If you are fine with no NAs in the dataframe, you can
# Every row receives the Freq total of its (Var2, Var3) combination; the
# result stays grouped by Var2 and Var3.
mutate(
  group_by(df, Var2, Var3),
  column = sum(Freq)
)
# A tibble: 8 × 5
# Groups: Var2, Var3 [4]
Var1 Var2 Var3 Freq column
<fct> <fct> <fct> <int> <int>
1 0 congruent spoken 8 10
2 1 congruent spoken 2 10
3 0 incongruent spoken 10 12
4 1 incongruent spoken 2 12
5 0 congruent written 10 12
6 1 congruent written 2 12
7 0 incongruent written 10 12
8 1 incongruent written 2 12

How to obtain freqs & percentages for all categorical vars in df

My df, Chap3, has ~50 categorical variables. I want to produce a frequency table for each categorical variable that also includes percentages. The code below works fine for the single var bsex but I cannot figure out how to repeat it for all categorical vars. Have tried using variants of apply, using select_if(is.factor), etc, to no avail.
# Frequency table for bsex with percentages rounded to one decimal place.
Chap3 %>%
  count(bsex) %>%
  mutate(percent = round(100 * prop.table(n), 1))
For such cases it is better if you get the categorical data in long format.
# Stack all factor columns into name/value pairs, count each level and
# replace the counts by within-variable proportions (one decimal place).
Chap3 %>%
  pivot_longer(cols = where(is.factor)) %>%
  count(name, value) %>%
  group_by(name) %>%
  mutate(n = round(prop.table(n), 1)) %>%
  # ungroup() ends the pipeline and drops the grouping so that later
  # steps are not silently applied per variable.
  ungroup()
# name value n
# <chr> <fct> <dbl>
#1 bsex 0 0.4
#2 bsex 1 0.6
#3 csex 0 0.5
#4 csex 1 0.5
It is easier to help if you provide data in a reproducible format
# Small random example: an id plus two binary factor columns.
Chap3 <- data.frame(
  id = seq_len(10),
  bsex = factor(sample(c(1, 0), size = 10, replace = TRUE)),
  csex = factor(sample(c(1, 0), size = 10, replace = TRUE))
)
We may use table/proportions from base R
# Column-wise proportions of each level for every categorical variable.
# type.convert(as.is = TRUE) turns the factor columns into plain vectors
# so that stack() can combine them into values/ind pairs.
proportions(table(stack(type.convert(Chap3[-1], as.is = TRUE))), 2)
values bsex csex
0 0.4 0.5
1 0.6 0.5
# dput() of the example data: 10 rows with id and two binary factors.
Chap3 <- structure(list(id = 1:10, bsex = structure(c(2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
csex = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))

squashing multiple rows by time difference

Assuming these are few timestamped observations in a dataset:
Id Status DateCreated Group
10 Read 2017-11-04 18:24:55 Red
10 Write 2017-11-04 18:24:56 Red
10 Review 2017-11-04 18:25:16 Red
10 Read 2017-11-04 18:26:17 Red
10 Write 2017-11-04 18:26:47 Red
How do I collapse rows that are within 1 minute of each other?
For example, rows 1,2,3 are collapsed into 1 row and rows 4 and 5 are collapsed into second row.
The expected output would look like this:
Id Status DateCreated Date Ended Group
10 Read,Write,Review 2017-11-04 18:24:55 2017-11-04 18:25:16 Red, Red, Red
10 Read,Write 2017-11-04 18:26:17 2017-11-04 18:26:47 Red, Red
Here is the code to reproduce the test dataset in this example:
# dput() of the five timestamped example observations.
df <- structure(list(Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
Status = structure(c(1L, 3L, 2L, 1L, 3L), .Label = c("Read",
"Review", "Write"), class = "factor"), DateCreated = structure(1:5, .Label = c("2017-11-04 18:24:55",
"2017-11-04 18:24:56", "2017-11-04 18:25:16", "2017-11-04 18:26:17",
"2017-11-04 18:26:47"), class = "factor"), Group = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "Red", class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
I would do something like this:
# Collapse the rows into one record with the combined statuses/groups and
# the first and last timestamp.
# NOTE(review): no grouping step is visible here, so this collapses the
# whole data frame into a single row.
df %>%
  mutate(DateCreated = ymd_hms(DateCreated)) %>%
  summarise(
    Status = paste(Status, collapse = ", "),
    # Take the end time first: summarise() lets later expressions see
    # columns created earlier in the same call, so once DateCreated is
    # reduced to its first value, last(DateCreated) would return that too.
    Date_ended = last(DateCreated),
    DateCreated = DateCreated[1],
    Group = paste(Group, collapse = ", ")
  ) %>%
  # Restore the intended column order.
  select(Status, DateCreated, Date_ended, Group)
# The five timestamped example observations (all columns are factors).
df <-
  structure(
    list(
      Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
      Status = structure(
        c(1L, 3L, 2L, 1L, 3L),
        .Label = c("Read",
                   "Review", "Write"),
        class = "factor"
      ),
      DateCreated = structure(
        1:5,
        .Label = c(
          "2017-11-04 18:24:55",
          "2017-11-04 18:24:56",
          "2017-11-04 18:25:16",
          "2017-11-04 18:26:17",
          "2017-11-04 18:26:47"
        ),
        class = "factor"
      ),
      Group = structure(c(1L,
                          1L, 1L, 1L, 1L), .Label = "Red", class = "factor")
    ),
    class = "data.frame",
    row.names = c(NA,-5L)
  )
df2 <-
df %>%
mutate(DateCreated = as_datetime(df$DateCreated)) %>%
arrange(DateCreated) %>%
mutate(diff = DateCreated - lag(DateCreated))
df2$diff[1] <- 0L
g <- 0
df3 <- mutate(df2, date_groups =
accumulate(df2$diff, function(x, y)
if (y - x < 60)
else {
g <<- g + 1
})) %>%
group_by(date_groups) %>%
Status = paste(Status, collapse = ", "),
DateCreated = DateCreated[1],
Date_ended = last(DateCreated),
Group = paste(Group, collapse = ", ")
#> # A tibble: 2 x 5
#> date_groups Status DateCreated Date_ended Group
#> <dbl> <chr> <dttm> <dttm> <chr>
#> 1 0 Read, Write… 2017-11-04 18:24:55 2017-11-04 18:24:55 Red, Re…
#> 2 1 Read, Write 2017-11-04 18:26:17 2017-11-04 18:26:17 Red, Red
Created on 2019-01-28 by the reprex package (v0.2.1)

Calculate max value across multiple columns by multiple groups

I have a data file with numeric values in three columns and two grouping variables (ID and Group) from which I need to calculate a single max value by ID and Group:
# dput() of the input: three score columns plus the ID and Group keys.
structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names = c(NA,
-4L))
The result I am trying to obtain is:
# dput() of the expected result: one maximum per ID/Group combination.
structure(list(ID = structure(c(1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Max = c(11L, 5L, 11L)), class = "data.frame",
row.names = c(NA,
-3L))
I am trying the following in dplyr:
SampTable<-SampDF %>% group_by(ID,Group) %>%
summarize(max = pmax(SampDF$Score1, SampDF$Score2,SampDF$Score3))
But it generates this error:
Error in summarise_impl(.data, dots) :
Column `max` must be length 1 (a summary value), not 4
Is there an easy way to achieve this in dplyr or data.table?
Solution using data.table. Find max value on 3:5 columns (Score columns) by ID and Group.
# For each ID/Group, pass all score columns (.SD, columns 3:5) to max()
# at once so a single maximum over every value is returned.
d[, .(Max = do.call(max, .SD)), .SDcols = 3:5, .(ID, Group)]
ID Group Max
1: a1 abc 11
2: a1 def 5
3: a2 def 11
# dput() of the example input used by the data.table solution.
d <- structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names = c(NA,
-4L))
A solution using tidyverse.
# Stack the Score columns into one Value column, then take the maximum
# per ID/Group combination.
dat2 <- dat1 %>%
  pivot_longer(
    starts_with("Score"),
    names_to = "Column",
    values_to = "Value"
  ) %>%
  group_by(ID, Group) %>%
  summarise(Max = max(Value)) %>%
  # ungroup() ends the pipeline and removes the grouping that
  # summarise() leaves on ID.
  ungroup()
# # A tibble: 3 x 3
# ID Group Max
# <fct> <fct> <dbl>
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here are a couple of other options with tidyverse
# Nest the score columns per ID/Group and take the maximum over all
# values of each nested data frame.
df1 %>%
  group_by(ID, Group) %>%
  nest %>%
  mutate(Max = map_dbl(data, ~ max(unlist(.x)))) %>%
  # Drop the nested list column so only ID, Group and Max remain.
  select(-data)
Or using pmax
# Row-wise maximum over columns 3:5 first, then the largest of those row
# maxima within each ID/Group combination.
df1 %>%
  mutate(Max = do.call(pmax, .[3:5])) %>%
  group_by(ID, Group) %>%
  summarise(Max = max(Max))
# A tibble: 3 x 3
# Groups: ID [?]
# ID Group Max
# <fct> <fct> <dbl>
#1 a1 abc 11
#2 a1 def 5
#3 a2 def 11
Or using base R
# Row-wise maximum of the three score columns (pmax), aggregated to the
# largest value per ID/Group combination.
aggregate(cbind(Max = do.call(pmax, df1[3:5])) ~ ID + Group, df1, max)
Here is a tidyverse solution using nest :
# Nest every column except the first two (ID, Group) into a list column
# named "Max", then replace each nested data frame by its overall maximum.
# NOTE(review): `.key` and the bare negative selection look like the
# pre-1.0 tidyr nest() interface; confirm against the installed version.
df %>%
nest(-(1:2),.key="Max") %>%
mutate_at("Max",map_dbl, max)
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
In base R:
# Per-group maximum of every score column, followed by the row-wise
# maximum (pmax) across those per-group maxima.
res <- aggregate(. ~ ID + Group, df, max)
res <- cbind(res[1:2], Max = do.call(pmax, res[-(1:2)]))
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here is a base R solution
# Overall maximum of all non-key columns for each ID x Group cell.
# gives 2x2 table
x <- by(df[, !names(df) %in% c("ID", "Group")], list(df$ID, df$Group), max)
# get requested format
# expand.grid() varies ID fastest, matching the column-major order in
# which as.vector() flattens x.
tmp <- expand.grid(ID = rownames(x), Group = colnames(x))
tmp$Max <- as.vector(x)
# ID/Group combinations without any rows are NA in x; drop them.
tmp[complete.cases(tmp), ]
#R ID Group Max
#R 1 a1 abc 11
#R 3 a1 def 5
#R 4 a2 def 11
# Example input: two factor keys (ID, Group) and three integer scores.
df <- data.frame(
  ID = factor(c("a1", "a1", "a1", "a2")),
  Group = factor(c("abc", "abc", "def", "def")),
  Score1 = c(10L, 0L, 0L, 5L),
  Score2 = c(0L, 0L, 5L, 10L),
  Score3 = c(0L, 11L, 2L, 11L)
)

Summary multiple columns with dplyr - categorical version

Following this question and this one, I wondered what was the best option to summarise categorical variables in one dataset.
I have a dataset such as
# A tibble: 10 × 4
empstat_couple nssec7_couple3 nchild07 age_couple
<chr> <fctr> <fctr> <dbl>
1 Neo-Trad Lower Managerial 1child 39
2 Neo-Trad Higher Managerial 1child 31
3 Neo-Trad Manual and Routine 1child 33
4 Trad Higher Managerial 1child 43
The 3 first variables are categorical (character or factor) and the last numerical.
What I would like is something like (output)
var n p
1: Neo-Trad 6 0.6
2: OtherArrangment 2 0.2
3: Trad 2 0.2
4: Higher Managerial 4 0.4
5: Lower Managerial 5 0.5
6: Manual and Routine 1 0.1
7: 1child 9 0.9
8: 2children 1 0.1
Well for the numerical variable, I am unsure how to add it meaningfully to the summary.
I guess the most basic way to go is
a = count(dt, empstat_couple) %>% mutate(p = n / sum(n))
b = count(dt, nssec7_couple3) %>% mutate(p = n / sum(n))
c = count(dt, nchild07) %>% mutate(p = n / sum(n))
I wondered if a summarise_each solution existed ?
This doesn't work
dt %>% summarise_each(funs(count))
Using apply I could come up with this
apply(dt, 2, %>% rbindlist()
But it's not great.
Any suggestions ?
dt = structure(list(empstat_couple = c("Neo-Trad", "Neo-Trad", "Neo-Trad",
"Trad", "OtherArrangment", "Neo-Trad", "Trad", "OtherArrangment",
"Neo-Trad", "Neo-Trad"), nssec7_couple3 = structure(c(2L, 1L,
4L, 1L, 2L, 2L, 1L, 2L, 1L, 2L), .Label = c("Higher Managerial",
"Lower Managerial", "Intermediate", "Manual and Routine"), class = "factor"),
nchild07 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("1child", "2children", ">2children"), class = "factor"),
age_couple = c(39, 31, 33, 43, 32, 28, 28, 40, 33, 26), hldid = 1:10), .Names = c("empstat_couple",
"nssec7_couple3", "nchild07", "age_couple", "hldid"), row.names = c(NA,
-10L), class = "data.frame")
We can melt with data.table and get the .N and proportion
# Frequency and proportion of every value of every non-age column.
# Steps, in evaluation order:
#   1. setDT(dt) converts dt to a data.table by reference; melt() stacks all
#      columns except age_couple into variable/value pairs.
#   2. n := .N (by value) stores how often each value occurs.
#   3. unique(..., by = ...) keeps one row per variable/value/n combination.
#   4. p := n / sum(n) (by variable) turns the counts into proportions.
#   5. The helper columns are removed; the trailing [] makes the result
#      print after the := assignment.
unique(melt(setDT(dt), id.var = "age_couple")[, n := .N , value],
by = c("variable", "value", "n"))[, p := n/sum(n), variable
][, c("age_couple", "variable" ) := NULL][]
Or using dplyr/tidyr
gather(dt, var1, var, -age_couple) %>%
group_by(var) %>%
mutate(n = n()) %>%
select(-age_couple) %>%
unique() %>%
group_by(var1) %>%
mutate(p= n/sum(n)) %>%
ungroup() %>%
